Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -105,7 +105,7 @@ static cl::opt EnableSDWAPeephole( "amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"), - cl::init(false)); + cl::init(true)); // Enable address space based alias analysis static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, Index: test/CodeGen/AMDGPU/add.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/add.v2i16.ll +++ test/CodeGen/AMDGPU/add.v2i16.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_v2i16: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -53,7 +53,7 @@ ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; VI: v_add_i32 -; VI: v_add_i32 +; VI: v_add_i32_sdwa define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -257,7 +257,7 @@ ; GFX9: v_pk_add_u16 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_add_u16_e32 +; VI: v_add_u16_sdwa ; VI: v_add_u16_e32 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 Index: test/CodeGen/AMDGPU/ashr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/ashr.v2i16.ll +++ test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -8,11 +8,14 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; CIVI: v_ashrrev_i32_e32 -; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI: v_or_b32_e32 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +; CI: v_ashrrev_i32_e32 +; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_or_b32_e32 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out @@ -24,10 +27,8 @@ ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} @@ -116,6 +117,14 @@ ; GCN: {{buffer|flat}}_load_dwordx2 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + ; GCN: {{buffer|flat}}_store_dwordx2 define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -98,7 +98,8 @@ ; FUNC-LABEL: {{^}}v_ctlz_i8: ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], -; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GCN: buffer_store_byte [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -36,13 +36,12 @@ ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; CI: v_or_b32_e32 -; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: flat_load_ushort [[HI:v[0-9]+]] +; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[LO]] -; VI-DAG: v_and_b32_e32 [[FABS_HI:v[0-9]+]], [[MASK]], [[HI]] -; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, -; VI: v_or_b32 +; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]] +; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]] ; VI: flat_store_dword ; GFX9: s_load_dword [[VAL:s[0-9]+]] @@ -61,10 +60,12 @@ ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: flat_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { Index: test/CodeGen/AMDGPU/fadd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fadd.f16.ll +++ test/CodeGen/AMDGPU/fadd.f16.ll @@ -74,13 +74,13 @@ ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fadd_v2f16( @@ -104,11 +104,14 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] +; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fadd_v2f16_imm_a( @@ -130,10 +133,14 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]] -; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[A_F16_1]] +; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_1]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fadd_v2f16_imm_b( Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -261,8 +261,8 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16: ; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} ; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]] -; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]] ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]] +; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]] ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI-NOT: 0xffff Index: test/CodeGen/AMDGPU/fmul.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmul.f16.ll +++ test/CodeGen/AMDGPU/fmul.f16.ll @@ -74,13 +74,13 @@ ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16( @@ -104,8 +104,8 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -129,8 +129,8 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -69,10 +69,11 @@ ; CI: v_or_b32_e32 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}} -; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] -; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] +; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} + define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { %fneg = fsub <2 x half> , %in store <2 x half> %fneg, <2 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/fpext.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fpext.f16.ll +++ test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s ; GCN-LABEL: {{^}}fpext_f16_to_f32 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -35,12 +35,14 @@ ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX89-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] +; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] +; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} ; GCN: s_endpgm + define amdgpu_kernel void @fpext_v2f16_to_v2f32( <2 x float> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { @@ -53,14 +55,16 @@ ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64 ; GCN: buffer_load_dword -; GCN-DAG: v_lshrrev_b32_e32 -; GCN-DAG: v_cvt_f32_f16_e32 +; SIGFX9-DAG: v_lshrrev_b32_e32 +; SIGFX9-DAG: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f64_f32_e32 ; GCN: v_cvt_f64_f32_e32 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm + define amdgpu_kernel void @fpext_v2f16_to_v2f64( <2 x double> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { Index: test/CodeGen/AMDGPU/fptosi.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptosi.f16.ll +++ test/CodeGen/AMDGPU/fptosi.f16.ll @@ -52,16 +52,25 @@ ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] -; GCN: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] -; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] -; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] + +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] +; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] +; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] +; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] + +; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm + define amdgpu_kernel void @fptosi_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { @@ -75,7 +84,8 @@ ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: v_cvt_i32_f32_e32 ; GCN: v_cvt_i32_f32_e32 ; GCN: buffer_store_dwordx2 @@ -96,7 +106,8 @@ ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: s_endpgm define amdgpu_kernel void @fptosi_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, Index: test/CodeGen/AMDGPU/fptoui.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptoui.f16.ll +++ test/CodeGen/AMDGPU/fptoui.f16.ll @@ -53,17 +53,24 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] + +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] -; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] +; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] + +; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] -; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] -; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] +; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm + define amdgpu_kernel void @fptoui_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { @@ -77,7 +84,8 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 ; GCN: buffer_store_dwordx2 @@ -98,7 +106,8 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64 ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa ; GCN: s_endpgm define amdgpu_kernel void @fptoui_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, Index: test/CodeGen/AMDGPU/fptrunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fptrunc.f16.ll +++ test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s @@ -37,10 +37,14 @@ ; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16: ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + +; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] ; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] @@ -48,6 +52,7 @@ ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm + define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x float> addrspace(1)* %a) { @@ -63,17 +68,19 @@ ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD + ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] ; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] ; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] + define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( <2 x half> addrspace(1)* %r, <2 x double> addrspace(1)* %a) { Index: test/CodeGen/AMDGPU/fsub.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fsub.f16.ll +++ test/CodeGen/AMDGPU/fsub.f16.ll @@ -74,21 +74,18 @@ ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] - - -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI-DAG: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] - -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI-DAG: v_subrev_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm + define amdgpu_kernel void @fsub_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -103,26 +100,29 @@ ; GCN-LABEL: {{^}}fsub_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SIVI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] - -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] +; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm + define amdgpu_kernel void @fsub_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { @@ -135,25 +135,29 @@ ; GCN-LABEL: {{^}}fsub_v2f16_imm_b: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SIVI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] -; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]] -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]] +; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}} ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm + define amdgpu_kernel void @fsub_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -283,10 +283,10 @@ ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: ; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] ; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] -; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { @@ -324,22 +324,26 @@ ; GCN: buffer_load_dwordx4 ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 + +; VI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa +; ... ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 @@ -368,11 +372,18 @@ ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: ; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] + +; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] +; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] + +; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]] +; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]] +; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]] + ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { @@ -392,18 +403,17 @@ ; XSI-NOT: v_cvt_f32_f16 ; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; XVI: v_cvt_f32_f16_e32 ; XVI: v_cvt_f32_f16_e32 ; XVI: v_cvt_f32_f16_e32 +; XVI: v_cvt_f32_f16_sdwa ; XVI-NOT: v_cvt_f32_f16 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] -; VI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] ; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] ; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] ; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] -; GCN-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] +; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] +; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] @@ -458,9 +468,14 @@ ; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: ; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] -; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] + +; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] +; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] +; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] + +; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]] + ; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { @@ -472,10 +487,10 @@ ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-NOT: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; GCN-DAG: v_cvt_f16_f32_e32 ; GCN: buffer_store_short ; GCN: buffer_store_dword ; GCN: s_endpgm @@ -488,10 +503,12 @@ ; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 +; GCN-DAG: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; GCN-DAG: v_cvt_f16_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { @@ -504,14 +521,22 @@ ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: ; GCN: buffer_load_dwordx4 ; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { Index: test/CodeGen/AMDGPU/immv216.ll =================================================================== --- test/CodeGen/AMDGPU/immv216.ll +++ test/CodeGen/AMDGPU/immv216.ll @@ -296,10 +296,9 @@ ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} ; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} -; VI: v_or_b32 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: buffer_store_dword define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %x = load <2 x half>, <2 x half> addrspace(1)* %in Index: test/CodeGen/AMDGPU/llvm.ceil.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -25,17 +25,21 @@ ; GCN-LABEL: {{^}}ceil_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + ; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @ceil_v2f16( Index: test/CodeGen/AMDGPU/llvm.cos.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -32,12 +32,10 @@ ; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] ; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] - -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] -; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] +; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] +; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] ; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] ; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] @@ -45,11 +43,13 @@ ; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GCN-NOT: and -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @cos_v2f16( Index: test/CodeGen/AMDGPU/llvm.dbg.value.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -mattr=-flat-for-global < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -amdgpu-sdwa-peephole=0 -verify-machineinstrs -mattr=-flat-for-global < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: ; CHECK: s_load_dwordx2 s[4:5] Index: test/CodeGen/AMDGPU/llvm.exp2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -25,17 +25,21 @@ ; GCN-LABEL: {{^}}exp2_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_exp_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @exp2_v2f16( Index: test/CodeGen/AMDGPU/llvm.floor.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -25,17 +25,21 @@ ; GCN-LABEL: {{^}}floor_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @floor_v2f16( Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -145,11 +145,8 @@ } ; GCN-LABEL: {{^}}fma_v2f16_imm_a: -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] - -; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} @@ -165,11 +162,10 @@ ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]] - +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]] +; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -196,9 +192,9 @@ ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] @@ -207,11 +203,10 @@ ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI_DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]] - +; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and @@ -240,9 +235,9 @@ ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] @@ -250,13 +245,12 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] - -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] ; GCN-NOT: and ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -115,19 +115,18 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] - - -; VI-FLUSH: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] -; VI-FLUSH: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] -; VI-FLUSH: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]] +; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] ; VI-FLUSH-NOT: v_and_b32 -; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]] +; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]] +; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] @@ -136,6 +135,7 @@ ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm + define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, Index: test/CodeGen/AMDGPU/llvm.log2.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -24,18 +24,22 @@ ; GCN-LABEL: {{^}}log2_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI-DAG: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + ; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @log2_v2f16( Index: test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -74,18 +74,18 @@ ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16( @@ -109,12 +109,12 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_a( @@ -136,8 +136,8 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -67,24 +67,24 @@ ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] +; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NOT: and +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16( @@ -103,16 +103,16 @@ ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] +; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and @@ -138,8 +138,8 @@ ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI_DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and Index: test/CodeGen/AMDGPU/llvm.rint.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -26,19 +26,20 @@ ; GCN-LABEL: {{^}}rint_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_and_b32 +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: v_and_b32 -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] Index: test/CodeGen/AMDGPU/llvm.sin.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -26,29 +26,32 @@ ; GCN-LABEL: {{^}}sin_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] +; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] +; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] - - -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] -; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] -; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] - -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] -; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] -; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] +; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] ; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] ; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @sin_v2f16( Index: test/CodeGen/AMDGPU/llvm.sqrt.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -25,17 +25,21 @@ ; GCN-LABEL: {{^}}sqrt_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_sqrt_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: v_and_b32 -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_and_b32 +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: v_and_b32 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @sqrt_v2f16( Index: test/CodeGen/AMDGPU/llvm.trunc.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -25,17 +25,21 @@ ; GCN-LABEL: {{^}}trunc_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN-NOT: v_and_b32 -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-NOT: v_and_b32 +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] + +; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] +; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NOT: v_and_b32 +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @trunc_v2f16( Index: test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/lshr.v2i16.ll +++ test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,8 +8,9 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; CIVI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { @@ -23,10 +24,8 @@ ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_lshrrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} Index: test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- test/CodeGen/AMDGPU/max.i16.ll +++ test/CodeGen/AMDGPU/max.i16.ll @@ -20,7 +20,7 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sge_v2i16: ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { @@ -39,7 +39,7 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sge_v3i16: ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-NOT: v_max_i16 @@ -61,9 +61,9 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sge_v4i16: ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -130,7 +130,7 @@ ; GCN-LABEL: {{^}}v_test_umax_ugt_v2i16: ; VI: v_max_u16_e32 -; VI: v_max_u16_e32 +; VI: v_max_u16_sdwa ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { Index: test/CodeGen/AMDGPU/min.ll =================================================================== --- test/CodeGen/AMDGPU/min.ll +++ test/CodeGen/AMDGPU/min.ll @@ -289,7 +289,7 @@ ; SI-NOT: v_min_u32_e32 ; VI: v_min_u16_e32 -; VI: v_min_u16_e32 +; VI: v_min_u16_sdwa ; VI: v_min_u16_e32 ; VI-NOT: v_min_u16_e32 Index: test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll =================================================================== --- test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -1,4 +1,7 @@ -; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s + +; FIXME: we should disable sdwa peephole because dead-code elimination, that +; runs after peephole, ruins this test (different register numbers) ; Spill all SGPRs so multiple VGPRs are required for spilling all of them. Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji --amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} @@ -72,8 +72,8 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { @@ -93,12 +93,12 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { entry: @@ -117,18 +117,18 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { entry: @@ -182,10 +182,10 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { entry: @@ -204,14 +204,14 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { entry: @@ -264,9 +264,9 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { entry: @@ -285,12 +285,12 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mul_u32_u24_sdwa define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { entry: @@ -301,6 +301,26 @@ ret void } +; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16: +; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_cvt_f32_i32_sdwa + +; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 + +define amdgpu_kernel void @sitofp_v2i16_to_v2f16( + <2 x half> addrspace(1)* %r, + <2 x i16> addrspace(1)* %a) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %r.val = sitofp <2 x i16> %a.val to <2 x half> + store <2 x half> %r.val, <2 x half> addrspace(1)* %r + ret void +} + ; GCN-LABEL: {{^}}mac_v2half: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- test/CodeGen/AMDGPU/select.f16.ll +++ test/CodeGen/AMDGPU/select.f16.ll @@ -155,18 +155,22 @@ } ; GCN-LABEL: {{^}}select_v2f16: -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 -; SI: v_cmp_lt_f32_e32 -; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_lt_f16_e64 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 -; SI: v_cvt_f16_f32_e32 -; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_cmp_lt_f32_e64 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e64 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 + +; VI: v_cmp_lt_f16_e64 +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32_e64 +; VI: v_cndmask_b32_e32 + ; GCN: s_endpgm define amdgpu_kernel void @select_v2f16( <2 x half> addrspace(1)* %r, Index: test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/shl.v2i16.ll +++ test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,11 +8,15 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; CIVI: v_lshlrev_b32_e32 -; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI: v_or_b32_e32 +; VI: v_lshlrev_b32_e32 +; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +; CI: v_lshlrev_b32_e32 +; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_or_b32_e32 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out @@ -24,10 +28,8 @@ ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_lshlrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} Index: test/CodeGen/AMDGPU/sitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/sitofp.f16.ll +++ test/CodeGen/AMDGPU/sitofp.f16.ll @@ -37,14 +37,23 @@ ; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16 ; GCN: buffer_load_dword -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 -; GCN: buffer_store_dword -; GCN: s_endpgm + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + +; GCN: buffer_store_dword +; GCN: s_endpgm + define amdgpu_kernel void @sitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { @@ -56,15 +65,23 @@ } ; GCN-LABEL: {{^}}sitofp_v2i32_to_v2f16 -; GCN: buffer_load_dwordx2 -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f32_i32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 -; GCN: buffer_store_dword -; GCN: s_endpgm +; GCN: buffer_load_dwordx2 + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f32_i32_e32 +; VI-DAG: v_cvt_f32_i32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + +; GCN: buffer_store_dword +; GCN: s_endpgm define amdgpu_kernel void @sitofp_v2i32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) { Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -8,15 +8,23 @@ ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -; CIVI: v_sub_i32_e32 -; CIVI-DAG: v_sub_i32_e32 -; CIVI: v_bfe_i32 -; CIVI-DAG: v_bfe_i32 -; CIVI-DAG: v_add_i32_e32 -; CIVI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 -; CIVI: v_add_i32_e32 -; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, -; CIVI: v_or_b32_e32 +; VI: v_sub_i32_e32 +; VI-DAG: v_sub_i32_e32 +; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: v_add_i32_e32 +; VI: v_add_i32_e32 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 + +; CI: v_sub_i32_e32 +; CI-DAG: v_sub_i32_e32 +; CI: v_bfe_i32 +; CI-DAG: v_bfe_i32 +; CI-DAG: v_add_i32_e32 +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 +; CI: v_add_i32_e32 +; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, +; CI: v_or_b32_e32 define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %neg = sub <2 x i16> zeroinitializer, %val %cond = icmp sgt <2 x i16> %val, %neg Index: test/CodeGen/AMDGPU/sra.ll =================================================================== --- test/CodeGen/AMDGPU/sra.ll +++ test/CodeGen/AMDGPU/sra.ll @@ -49,8 +49,8 @@ ; FUNC-LABEL: {{^}}ashr_v2i16: ; FIXME: The ashr operation is uniform, but because its operands come from a ; global load we end up with the vector instructions rather than scalar. -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in @@ -63,10 +63,10 @@ ; FUNC-LABEL: {{^}}ashr_v4i16: ; FIXME: The ashr operation is uniform, but because its operands come from a ; global load we end up with the vector instructions rather than scalar. -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in Index: test/CodeGen/AMDGPU/sub.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sub.v2i16.ll +++ test/CodeGen/AMDGPU/sub.v2i16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_test_sub_v2i16: ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -50,7 +50,7 @@ ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; VI: v_subrev_i32_e32 -; VI: v_subrev_i32_e32 +; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -252,7 +252,7 @@ ; GFX9: v_pk_sub_i16 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_subrev_u16_e32 +; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_subrev_u16_e32 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 Index: test/CodeGen/AMDGPU/uitofp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/uitofp.f16.ll +++ test/CodeGen/AMDGPU/uitofp.f16.ll @@ -38,16 +38,22 @@ ; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16 ; GCN: buffer_load_dword -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; VI: v_cvt_f32_i32_e32 -; VI: v_cvt_f32_i32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 -; GCN: buffer_store_dword -; GCN: s_endpgm + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f32_i32_sdwa +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + +; GCN: buffer_store_dword +; GCN: s_endpgm define amdgpu_kernel void @uitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { @@ -60,12 +66,20 @@ ; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16 ; GCN: buffer_load_dwordx2 -; GCN: v_cvt_f32_u32_e32 -; GCN: v_cvt_f32_u32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN-DAG: v_lshlrev_b32_e32 -; GCN-DAG: v_or_b32_e32 + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_lshlrev_b32_e32 +; SI: v_or_b32_e32 + +; VI-DAG: v_cvt_f32_u32_e32 +; VI-DAG: v_cvt_f32_u32_e32 +; VI-DAG: v_cvt_f16_f32_e32 +; VI-DAG: v_cvt_f16_f32_sdwa +; VI: v_or_b32_e32 + ; GCN: buffer_store_dword ; GCN: s_endpgm define amdgpu_kernel void @uitofp_v2i32_to_v2f16( Index: test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- test/CodeGen/AMDGPU/v_mac_f16.ll +++ test/CodeGen/AMDGPU/v_mac_f16.ll @@ -318,18 +318,14 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] - -; VI-DAG: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] -; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_1:[0-9]+]], 16, v[[C_F16_1]] -; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; VI-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]] +; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[C_V2_F16]] - +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]] ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -355,10 +351,12 @@ ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]] -; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]] -; VI-DAG: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}} + +; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + ; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_same_add( <2 x half> addrspace(1)* %r0, @@ -478,14 +476,17 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math: + ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] + ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] + ; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math( <2 x half> addrspace(1)* %r, @@ -506,14 +507,17 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math: + ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} + ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} +; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} + ; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math( <2 x half> addrspace(1)* %r, @@ -534,14 +538,17 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math: + ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} + ; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math( <2 x half> addrspace(1)* %r,