diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h
--- a/clang/lib/Basic/Targets/AMDGPU.h
+++ b/clang/lib/Basic/Targets/AMDGPU.h
@@ -115,6 +115,9 @@
     return getTriple().getArch() == llvm::Triple::amdgcn ? 64 : 32;
   }
 
+  bool hasBFloat16Type() const override { return true; }
+  const char *getBFloat16Mangling() const override { return "u6__bf16"; }
+
   const char *getClobbers() const override { return ""; }
 
   ArrayRef<const char *> getGCCRegNames() const override;
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -365,6 +365,10 @@
                   !isAMDGCN(Triple));
   UseAddrSpaceMapMangling = true;
 
+  // __bf16 is always available as a load/store-only type.
+  BFloat16Width = BFloat16Align = 16;
+  BFloat16Format = &llvm::APFloat::BFloat();
+
   HasLegalHalfType = true;
   HasFloat16 = true;
 
   WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64;
diff --git a/clang/test/CodeGenCUDA/amdgpu-bf16.cu b/clang/test/CodeGenCUDA/amdgpu-bf16.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCUDA/amdgpu-bf16.cu
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "amdgcn-amd-amdhsa" \
+// RUN:   -fcuda-is-device "-aux-target-cpu" "x86-64" -emit-llvm -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: @_Z8test_argPu6__bf16u6__bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT: [[BF16:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
+// CHECK-NEXT: [[BF16_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BF16]] to ptr
+// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store bfloat [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store bfloat [[TMP0]], ptr [[BF16_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[BF16_ASCAST]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store bfloat [[TMP1]], ptr [[TMP2]], align 2
+// CHECK-NEXT: ret void
+//
+__device__ void test_arg(__bf16 *out, __bf16 in) {
+  __bf16 bf16 = in;
+  *out = bf16;
+}
+
+// CHECK-LABEL: @_Z8test_retu6__bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
+// CHECK-NEXT: store bfloat [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: ret bfloat [[TMP0]]
+//
+__device__ __bf16 test_ret(__bf16 in) {
+  return in;
+}
+
+// CHECK-LABEL: @_Z9test_callu6__bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
+// CHECK-NEXT: store bfloat [[IN:%.*]], ptr [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CALL:%.*]] = call contract noundef bfloat @_Z8test_retu6__bf16(bfloat noundef [[TMP0]]) #[[ATTR1:[0-9]+]]
+// CHECK-NEXT: ret bfloat [[CALL]]
+//
+__device__ __bf16 test_call(__bf16 in) {
+  return test_ret(in);
+}
diff --git a/clang/test/SemaCUDA/amdgpu-bf16.cu b/clang/test/SemaCUDA/amdgpu-bf16.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCUDA/amdgpu-bf16.cu
@@ -0,0 +1,41 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 "-triple" "x86_64-unknown-linux-gnu" "-aux-triple" "amdgcn-amd-amdhsa" \
+// RUN:   "-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s
+// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "amdgcn-amd-amdhsa" \
+// RUN:   -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s
+
+// AMDGPU has storage-only support for bf16.
+
+#include "Inputs/cuda.h"
+
+__device__ void test(bool b, __bf16 *out, __bf16 in) {
+  __bf16 bf16 = in; // No error on using the type itself.
+
+  bf16 + bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+  bf16 - bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+  bf16 * bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+  bf16 / bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+
+  __fp16 fp16;
+
+  bf16 + fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 + bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 - fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 - bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 * fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 * bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 / fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 / bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 = fp16; // scalar-error {{assigning to '__bf16' from incompatible type '__fp16'}}
+  fp16 = bf16; // scalar-error {{assigning to '__fp16' from incompatible type '__bf16'}}
+  bf16 + (b ? fp16 : bf16); // scalar-error {{incompatible operand types ('__fp16' and '__bf16')}}
+  *out = bf16;
+}
+
+__bf16 hostfn(__bf16 a) {
+  return a;
+}
+
+typedef __bf16 foo __attribute__((__vector_size__(16), __aligned__(16)));
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -17,6 +17,9 @@
 // Calling convention for SI
 def CC_SI_Gfx : CallingConv<[
+  CCIfType<[bf16], CCBitConvertToType<i16>>,
+  CCIfType<[v2bf16], CCBitConvertToType<v2i16>>,
+
   // 0-3 are reserved for the stack buffer descriptor
   // 30-31 are reserved for the return address
   // 32 is reserved for the stack pointer
@@ -42,7 +45,8 @@
 def RetCC_SI_Gfx : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
-
+  CCIfType<[bf16], CCBitConvertToType<i16>>,
+  CCIfType<[v2bf16], CCBitConvertToType<v2i16>>,
   CCIfNotInReg;

 def CC_SI_SHADER : CallingConv<[
+  CCIfType<[bf16], CCBitConvertToType<i16>>,
+  CCIfType<[v2bf16], CCBitConvertToType<v2i16>>,
   CCIfInReg>>,

 def RetCC_SI_Shader : CallingConv<[
+  CCIfType<[bf16], CCBitConvertToType<i16>>,
+  CCIfType<[v2bf16], CCBitConvertToType<v2i16>>,
   CCIfType<[i32, i16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -183,6 +191,8 @@
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[bf16], CCBitConvertToType<i16>>,
+  CCIfType<[v2bf16], CCBitConvertToType<v2i16>>,
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
@@ -195,6 +205,8 @@
 def RetCC_AMDGPU_Func : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[bf16], CCBitConvertToType<i16>>,
+  CCIfType<[v2bf16], CCBitConvertToType<v2i16>>,
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -163,6 +163,7 @@
                  Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
@@ -178,6 +179,7 @@
   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
@@ -272,6 +274,7 @@
   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
 
+  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
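The calling-convention rules and the Expand'ed bf16 extending loads and truncating stores above all rest on the same bit-level fact: a bf16 value is exactly the high 16 bits of an f32, so it can travel in any 16-bit integer slot (which is what CCBitConvertToType<i16> arranges) and be widened back to f32 with a plain shift. A minimal C++ sketch of that relationship; the helper names are ours, not from the patch, and the narrowing direction truncates where a production converter would round to nearest-even:

```cpp
#include <cstdint>
#include <cstring>

// Widen a bf16 bit pattern to the f32 it denotes: place the 16 bits in the
// high half of a 32-bit word and reinterpret. No arithmetic happens.
static float bf16_to_f32(uint16_t bits) {
  uint32_t wide = static_cast<uint32_t>(bits) << 16;
  float f;
  std::memcpy(&f, &wide, sizeof(f));
  return f;
}

// Narrow an f32 to bf16 by keeping the high 16 bits (truncation only; a real
// converter rounds to nearest-even instead).
static uint16_t f32_to_bf16(float f) {
  uint32_t wide;
  std::memcpy(&wide, &f, sizeof(wide));
  return static_cast<uint16_t>(wide >> 16);
}
```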
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -144,16 +144,21 @@
   if (Subtarget->has16BitInsts()) {
     addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
     addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+    addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
 
     // Unless there are also VOP3P operations, not all operations are really legal.
     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
+    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
+    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
     addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
     addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
+    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
     addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
     addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
+    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
   }
 
   addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -256,13 +261,13 @@
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
   for (MVT VT :
-       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
-        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
-        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
-        MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
-        MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
-        MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
-        MVT::v32i32, MVT::v32f32}) {
+       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
+        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
+        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
+        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
+        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
+        MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16bf16,
+        MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -475,6 +480,9 @@
   setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
   setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
+  setOperationAction(ISD::BF16_TO_FP, MVT::i16, Custom);
+  setOperationAction(ISD::FP_TO_BF16, MVT::i16, Custom);
+
   if (Subtarget->has16BitInsts()) {
     setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                         ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
@@ -502,6 +510,15 @@
 
   setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
 
+  // BF16 - Constant Actions.
+  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
+
+  // BF16 - Load/store Actions.
+  setOperationAction(ISD::LOAD, MVT::bf16, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
+  setOperationAction(ISD::STORE, MVT::bf16, Promote);
+  AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
+
   // F16 - Constant Actions.
   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -571,11 +588,15 @@
     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
+    setOperationAction(ISD::STORE, MVT::v2bf16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v2bf16, MVT::i32);
 
     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
+    setOperationAction(ISD::LOAD, MVT::v2bf16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v2bf16, MVT::i32);
 
     setOperationAction(ISD::AND, MVT::v2i16, Promote);
     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
@@ -588,36 +609,50 @@
     AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
     setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
 
     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
 
     setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
     setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
+    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
 
     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
 
     setOperationAction(ISD::STORE, MVT::v8i16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
     setOperationAction(ISD::STORE, MVT::v8f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
+    setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
 
     setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
     setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
+    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
 
     setOperationAction(ISD::STORE, MVT::v16i16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
     setOperationAction(ISD::STORE, MVT::v16f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+    setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
 
     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                        MVT::v2i32, Expand);
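The Promote/AddPromotedToType pairs above encode that a bf16 (or bf16-vector) load or store is just an integer load or store of the same width; the custom BF16_TO_FP/FP_TO_BF16 handling in the next hunk then covers the conversions the softener leaves behind. As a source-level analogy, a sketch of what the legalizer arranges for the scalar case, not its actual code:

```cpp
#include <cstdint>
#include <cstring>

// Analogy for "Promote LOAD/STORE of bf16 to i16": the 16-bit pattern is
// moved as an integer and merely reinterpreted at the ends, so storage-only
// bf16 needs no new memory instructions.
struct BF16 { uint16_t bits; }; // storage-only: just a raw pattern

static BF16 load_bf16(const void *p) {
  uint16_t raw;                      // the promoted form: an i16 load
  std::memcpy(&raw, p, sizeof(raw));
  return BF16{raw};                  // the "bitcast" back to bf16 is free
}

static void store_bf16(void *p, BF16 v) {
  std::memcpy(p, &v.bits, sizeof(v.bits)); // an i16 store
}
```

The vector cases work the same way, only at i32/v2i32/v4i32/v8i32 widths.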
@@ -4780,6 +4815,21 @@
     return lowerXMUL_LOHI(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::BF16_TO_FP: {
+    // When we don't have 16-bit instructions, bf16 is illegal and gets
+    // softened to i16 for storage, with float being used for arithmetic.
+    //
+    // After softening, some i16 -> fp32 bf16_to_fp operations can be left
+    // over. Lower those to (f32 (fp_extend (f16 (bitconvert x))))
+    if (!Op->getValueType(0).isFloatingPoint() ||
+        Op->getOperand(0).getValueType() != MVT::i16)
+      break;
+
+    SDLoc SL(Op);
+    return DAG.getNode(
+        ISD::FP_EXTEND, SL, MVT::f32,
+        DAG.getNode(ISD::BITCAST, SL, MVT::f16, Op->getOperand(0)));
+  }
   }
   return SDValue();
 }
@@ -5131,6 +5181,22 @@
     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
     return;
   }
+  case ISD::FP_TO_BF16: {
+    // When we don't have 16-bit instructions, bf16 is illegal and gets
+    // softened to i16 for storage, with float being used for arithmetic.
+    //
+    // After softening, fp_to_bf16 can be emitted, but with an i16 VT instead.
+    // Of course those won't work, so we handle them here by lowering them
+    // to (i16 (bitconvert (f16 (fptrunc x))))
+    if (N->getValueType(0) != MVT::i16)
+      break;
+
+    SDLoc SL(N);
+    Results.push_back(
+        DAG.getNode(ISD::BITCAST, SL, MVT::i16,
+                    DAG.getFPExtendOrRound(N->getOperand(0), SL, MVT::f16)));
+    return;
+  }
   default:
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1427,6 +1427,8 @@
 def : BitConvert ;
 def : BitConvert ;
 def : BitConvert ;
+def : BitConvert ;
+def : BitConvert ;
 
 // 32-bit bitcast
 def : BitConvert ;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -381,7 +381,7 @@
   let HasSGPR = 1;
 }
 
-def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
+def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> {
   let CopyCost = 1;
   let Size = 16;
   let isAllocatable = 0;
@@ -390,7 +390,7 @@
 
 // TODO: Do we need to set DwarfRegAlias on register tuples?
 
-def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
                 (add (sequence "SGPR%u_LO16", 0, 105))> {
   let AllocationPriority = 0;
   let Size = 16;
@@ -398,7 +398,7 @@
   let HasSGPR = 1;
 }
 
-def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
                 (add (sequence "SGPR%u_HI16", 0, 105))> {
   let isAllocatable = 0;
   let Size = 16;
@@ -407,7 +407,7 @@
 }
 
 // SGPR 32-bit registers
-def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
              (add (sequence "SGPR%u", 0, 105))> {
   // Give all SGPR classes higher priority than VGPR classes, because
   // we want to spill SGPRs to VGPRs.
@@ -456,14 +456,14 @@
 def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
 
 // Trap handler TMP 32-bit registers
-def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
+def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v2bf16], 32,
                 (add (sequence "TTMP%u", 0, 15))> {
   let isAllocatable = 0;
   let HasSGPR = 1;
 }
 
 // Trap handler TMP 16-bit registers
-def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
                 (add (sequence "TTMP%u_LO16", 0, 15))> {
   let Size = 16;
   let isAllocatable = 0;
@@ -589,8 +589,8 @@
   list<ValueType> types = reg_types;
 }
 
-def Reg16Types : RegisterTypes<[i16, f16]>;
-def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
 
 let HasVGPR = 1 in {
 def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
@@ -674,7 +674,7 @@
 }
 
 // AccVGPR 32-bit registers
-def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
                 (add (sequence "AGPR%u", 0, 255))> {
   let AllocationPriority = 0;
   let Size = 32;
@@ -725,14 +725,14 @@
 // Register classes used as source and destination
 //===----------------------------------------------------------------------===//
 
-def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
                 (add FP_REG, SP_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
   let HasSGPR = 1;
 }
 
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
                 (add PRIVATE_RSRC_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -748,7 +748,7 @@
 let GeneratePressureSet = 0, HasSGPR = 1 in {
 // Subset of SReg_32 without M0 for SMRD instructions and alike.
 // See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO, SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI, @@ -757,7 +757,7 @@ let AllocationPriority = 0; } -def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16, TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16, @@ -769,29 +769,29 @@ let AllocationPriority = 0; } -def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { let AllocationPriority = 0; } -def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> { let Size = 16; let AllocationPriority = 0; } -def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 0; } -def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> { let Size = 16; let AllocationPriority = 0; } -def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> { let Size = 16; let AllocationPriority = 0; @@ -799,33 +799,33 @@ } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { let AllocationPriority = 0; let HasSGPR = 1; } let GeneratePressureSet = 0 in { -def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasSGPR = 1; } -def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, +def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 1; let HasSGPR = 1; } -def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, +def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; let HasSGPR = 1; } -def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def 
SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> { let CopyCost = 1; @@ -833,7 +833,7 @@ let HasSGPR = 1; } -def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 1; @@ -886,11 +886,11 @@ } defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>; defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>; defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>; defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>; defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; @@ -901,7 +901,7 @@ defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; } -def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add VGPR_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; @@ -930,15 +930,15 @@ } } -defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], +defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4], (add VGPR_64)>; defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>; +defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>; defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; -defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>; +defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>; defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>; defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>; defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>; @@ -959,10 +959,10 @@ } } -defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], +defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16], (add AGPR_64)>; defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; -defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>; +defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add AGPR_128)>; defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; @@ -989,14 +989,14 
@@ let HasVGPR = 1; } -def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; let HasSGPR = 1; } -def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -161,9 +161,9 @@ // because dealing with the write to high half of the register is // difficult. def : GCNPat < - (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), (v2f16 (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, @@ -174,9 +174,9 @@ def : GCNPat < (build_vector f16:$elt0, - (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), + (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), (v2f16 (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, @@ -186,12 +186,12 @@ def : GCNPat < (AMDGPUclamp (build_vector - (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))), - (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))), + (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), + (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, $hi_src1_modifiers, $hi_src1, $hi_src2_modifiers, $hi_src2, diff --git a/llvm/test/CodeGen/AMDGPU/bf16-ops.ll b/llvm/test/CodeGen/AMDGPU/bf16-ops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bf16-ops.ll @@ -0,0 +1,32 @@ +; RUN: not llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs +; RUN: not llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs +; RUN: not llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs +; RUN: not llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs + +; TODO: Add GlobalISel tests, currently 
it silently miscompiles because GISel does not handle BF16 at all.
+
+; We only have storage-only BF16 support, so check that codegen fails if we
+; attempt to perform operations on bfloats.
+
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+define void @test_fneg(bfloat %a, bfloat addrspace(1)* %out) {
+  %result = fneg bfloat %a
+  store bfloat %result, bfloat addrspace(1)* %out
+  ret void
+}
+
+define void @test_fabs(bfloat %a, bfloat addrspace(1)* %out) {
+  %result = call bfloat @llvm.fabs.bf16(bfloat %a)
+  store bfloat %result, bfloat addrspace(1)* %out
+  ret void
+}
+
+define void @test_add(bfloat %a, bfloat %b, bfloat addrspace(1)* %out) {
+  %result = fadd bfloat %a, %b
+  store bfloat %result, bfloat addrspace(1)* %out
+  ret void
+}
+
+define void @test_mul(bfloat %a, bfloat %b, bfloat addrspace(1)* %out) {
+  %result = fmul bfloat %a, %b
+  store bfloat %result, bfloat addrspace(1)* %out
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -0,0 +1,956 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefixes=GFX7
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX9
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
+
+; We only have storage-only BF16 support. We can load/store these values,
+; since we treat them as u16, but we do not support any operations on them.
+; As such, codegen is expected to fail for any operation other than simple
+; loads and stores.
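With storage-only support, the one computation pattern left to frontends is widen-compute-narrow: extend each bf16 to f32, do the math there, and truncate the result back to 16 bits. The direct fadd/fmul on bfloat exercised by bf16-ops.ll above is exactly what should fail to select. A hypothetical C++ equivalent of the legal pattern (not part of the patch; plain truncation stands in for round-to-nearest-even):

```cpp
#include <cstdint>
#include <cstring>

// Add two bf16 values, given as raw 16-bit patterns, by computing in f32.
static uint16_t bf16_add(uint16_t a, uint16_t b) {
  float fa, fb;
  uint32_t ua = static_cast<uint32_t>(a) << 16; // bf16 -> f32 bits
  uint32_t ub = static_cast<uint32_t>(b) << 16;
  std::memcpy(&fa, &ua, sizeof(fa));
  std::memcpy(&fb, &ub, sizeof(fb));
  float sum = fa + fb;                          // the actual math is f32
  uint32_t us;
  std::memcpy(&us, &sum, sizeof(us));
  return static_cast<uint16_t>(us >> 16);       // truncate back to bf16
}
```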
+ +define void @test_load_store(bfloat addrspace(1)* %in, bfloat addrspace(1)* %out) { +; GCN-LABEL: test_load_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load bfloat, bfloat addrspace(1)* %in + store bfloat %val, bfloat addrspace(1) * %out + ret void +} + +define void @test_load_store_v2bf16(<2 x bfloat> addrspace(1)* %in, <2 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_load_store_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, 
v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <2 x bfloat>, <2 x bfloat> addrspace(1)* %in + store <2 x bfloat> %val, <2 x bfloat> addrspace(1) * %out + ret void +} + +define void @test_load_store_v4bf16(<4 x bfloat> addrspace(1)* %in, <4 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_load_store_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <4 x bfloat>, <4 x bfloat> addrspace(1)* %in + store <4 x bfloat> %val, <4 x bfloat> addrspace(1) * %out + ret void +} + +define void @test_load_store_v8bf16(<8 x bfloat> addrspace(1)* %in, <8 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_load_store_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <8 x bfloat>, <8 x bfloat> addrspace(1)* %in + store <8 x bfloat> %val, <8 x bfloat> addrspace(1) * %out + ret void +} + +define void @test_load_store_v16bf16(<16 x bfloat> addrspace(1)* %in, <16 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_load_store_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_load_store_v16bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_load_store_v16bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_load_store_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store_v16bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %val = load <16 x bfloat>, <16 x bfloat> addrspace(1)* %in + store <16 x bfloat> %val, <16 x bfloat> addrspace(1) * %out + ret void +} + +define void @test_arg_store(bfloat %in, bfloat addrspace(1)* %out) { +; GCN-LABEL: test_arg_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_short v[1:2], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_short v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_short v[1:2], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store bfloat %in, bfloat addrspace(1) * %out + ret void +} + +define void @test_arg_store_v2bf16(<2 x bfloat> %in, <2 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_arg_store_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v2bf16: +; GFX7: ; %bb.0: 
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dword v[1:2], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <2 x bfloat> %in, <2 x bfloat> addrspace(1) * %out + ret void +} + +define void @test_arg_store_v4bf16(<4 x bfloat> %in, <4 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_arg_store_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v6 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <4 x bfloat> %in, <4 x bfloat> addrspace(1) 
* %out + ret void +} + +define void @test_arg_store_v8bf16(<8 x bfloat> %in, <8 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_arg_store_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GCN-NEXT: v_or_b32_e32 v3, v6, v2 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v10, v7 +; GCN-NEXT: v_or_b32_e32 v0, v0, v11 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_arg_store_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + store <8 x bfloat> %in, <8 x bfloat> addrspace(1) * %out + ret void +} + +define void @test_arg_store_v16bf16(<16 x bfloat> %in, <16 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_arg_store_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_cvt_f16_f32_e32 
v2, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v18, v15 +; GCN-NEXT: v_or_b32_e32 v0, v0, v19 +; GCN-NEXT: v_or_b32_e32 v7, v14, v20 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_arg_store_v16bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v18 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v15 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v13 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v13, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v12, v7, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_or_b32_e32 v10, v7, v0 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_arg_store_v16bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_arg_store_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GFX9-NEXT: 
+define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, bfloat addrspace(1)* %out) {
+; GCN-LABEL: test_inreg_arg_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s39, 0xf000
+; GCN-NEXT: s_mov_b32 s38, 0
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, s4
+; GCN-NEXT: s_mov_b32 s36, s38
+; GCN-NEXT: s_mov_b32 s37, s38
+; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_inreg_arg_store:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s4
+; GFX7-NEXT: s_mov_b32 s38, 0
+; GFX7-NEXT: s_mov_b32 s39, 0xf000
+; GFX7-NEXT: s_mov_b32 s36, s38
+; GFX7-NEXT: s_mov_b32 s37, s38
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_inreg_arg_store:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_inreg_arg_store:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_inreg_arg_store:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store bfloat %in, bfloat addrspace(1) * %out
+ ret void
+}
+
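+; A descriptive comment (not in the original patch): bitcasts between bfloat and
+; i16 should be free, i.e. a 16-bit load plus a 16-bit store with no conversions.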
+define void @test_bitcast_from_bfloat(bfloat addrspace(1)* %in, i16 addrspace(1)* %out) {
+; GCN-LABEL: test_bitcast_from_bfloat:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_bitcast_from_bfloat:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_bitcast_from_bfloat:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_bitcast_from_bfloat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_bitcast_from_bfloat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load bfloat, bfloat addrspace(1) * %in
+ %val_int = bitcast bfloat %val to i16
+ store i16 %val_int, i16 addrspace(1)* %out
+ ret void
+}
+
+define void @test_bitcast_to_bfloat(bfloat addrspace(1)* %out, i16 addrspace(1)* %in) {
+; GCN-LABEL: test_bitcast_to_bfloat:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_bitcast_to_bfloat:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_bitcast_to_bfloat:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_ushort v2, v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_bitcast_to_bfloat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_bitcast_to_bfloat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_load_ushort v2, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load i16, i16 addrspace(1)* %in
+ %val_fp = bitcast i16 %val to bfloat
+ store bfloat %val_fp, bfloat addrspace(1)* %out
+ ret void
+}
+
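+; A descriptive comment (not in the original patch): returning a bfloat argument
+; should need no instructions beyond the return sequence; the alloca variant below
+; additionally round-trips the value through a 16-bit stack slot.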
+define bfloat @test_ret(bfloat %in) {
+; GCN-LABEL: test_ret:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ret bfloat %in
+}
+
+define bfloat @test_alloca_load_store_ret(bfloat %in) {
+; GCN-LABEL: test_alloca_load_store_ret:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_alloca_load_store_ret:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_alloca_load_store_ret:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_alloca_load_store_ret:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_alloca_load_store_ret:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %in.addr = alloca bfloat, align 2, addrspace(5)
+ store volatile bfloat %in, bfloat addrspace(5)* %in.addr, align 2
+ %0 = load volatile bfloat, bfloat addrspace(5)* %in.addr, align 2
+ ret bfloat %0
+}
+