diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -45,6 +45,8 @@
                              MachineIRBuilder &B) const;
   bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI,
                      MachineIRBuilder &B) const;
+  bool legalizeFnearbyint(MachineInstr &MI, MachineRegisterInfo &MRI,
+                          MachineIRBuilder &B) const;
   bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
                      MachineIRBuilder &B) const;
   bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1089,6 +1089,21 @@
       .clampScalar(0, S32, S64)
       .scalarize(0);
   }
+
+  // G_FNEARBYINT is custom-legalized into G_FRINT (see legalizeFnearbyint),
+  // so only the set of accepted scalar types depends on the subtarget: f16 is
+  // kept only when 16-bit instructions exist; otherwise it is clamped to f32.
+  if (ST.has16BitInsts()) {
+    getActionDefinitionsBuilder(G_FNEARBYINT)
+        .customFor({S16, S32, S64})
+        .clampScalar(0, S16, S64)
+        .scalarize(0);
+  } else {
+    getActionDefinitionsBuilder(G_FNEARBYINT)
+        .customFor({S32, S64})
+        .clampScalar(0, S32, S64)
+        .scalarize(0);
+  }
 
   getActionDefinitionsBuilder(G_PTR_ADD)
       .unsupportedFor({BufferFatPtr, RsrcPtr})
@@ -1967,6 +1982,8 @@
     return legalizeAddrSpaceCast(MI, MRI, B);
   case TargetOpcode::G_FRINT:
     return legalizeFrint(MI, MRI, B);
+  case TargetOpcode::G_FNEARBYINT:
+    return legalizeFnearbyint(MI, MRI, B);
   case TargetOpcode::G_FCEIL:
     return legalizeFceil(MI, MRI, B);
   case TargetOpcode::G_FREM:
@@ -2297,6 +2314,20 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeFnearbyint(MachineInstr &MI,
+                                             MachineRegisterInfo &MRI,
+                                             MachineIRBuilder &B) const {
+  // FNEARBYINT and FRINT are the same, except in their handling of FP
+  // exceptions. Those aren't really meaningful for us, and OpenCL only has
+  // rint, so replace the instruction with an equivalent G_FRINT.
+  const Register Dst = MI.getOperand(0).getReg();
+  const Register Src = MI.getOperand(1).getReg();
+  // Forward MI's flags so they are not dropped by the rewrite.
+  B.buildInstr(TargetOpcode::G_FRINT, {Dst}, {Src}, MI.getFlags());
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeFceil(
     MachineInstr &MI, MachineRegisterInfo &MRI,
     MachineIRBuilder &B) const {
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -2,7 +2,10 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
 
 declare half @llvm.nearbyint.f16(half) #0
 declare float @llvm.nearbyint.f32(float) #0
@@ -50,18 +53,53 @@
 ; VI-NEXT: flat_store_short v[0:1], v2
 ; VI-NEXT: s_endpgm
 ;
-; GFX11-LABEL: fnearbyint_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f16_e32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; 
GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: fnearbyint_f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_rndne_f16_e32 v1, s4 +; GFX10-SDAG-NEXT: global_store_short v0, v1, s[2:3] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fnearbyint_f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_rndne_f16_e32 v0, s4 +; GFX10-GISEL-NEXT: global_store_short v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fnearbyint_f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_rndne_f16_e32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fnearbyint_f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_rndne_f16_e32 v0, s2 +; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %1 = call half @llvm.nearbyint.f16(half %in) store half %1, ptr addrspace(1) %out ret void 
@@ -90,18 +128,53 @@ ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fnearbyint_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: fnearbyint_f32: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_rndne_f32_e32 v1, s4 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fnearbyint_f32: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_rndne_f32_e32 v0, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fnearbyint_f32: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_rndne_f32_e32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fnearbyint_f32: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 
s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_rndne_f32_e32 v0, s2 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm entry: %0 = call float @llvm.nearbyint.f32(float %in) store float %0, ptr addrspace(1) %out @@ -133,17 +206,49 @@ ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fnearbyint_v2f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s3 -; GFX11-NEXT: v_rndne_f32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: fnearbyint_v2f32: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_rndne_f32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_rndne_f32_e32 v0, s2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fnearbyint_v2f32: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_rndne_f32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_rndne_f32_e32 v1, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fnearbyint_v2f32: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_rndne_f32_e32 v1, s3 +; 
GFX11-SDAG-NEXT: v_rndne_f32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fnearbyint_v2f32: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_rndne_f32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_rndne_f32_e32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm entry: %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) store <2 x float> %0, ptr addrspace(1) %out @@ -179,21 +284,65 @@ ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fnearbyint_v4f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v3, s7 -; GFX11-NEXT: v_rndne_f32_e32 v2, s6 -; GFX11-NEXT: v_rndne_f32_e32 v1, s5 -; GFX11-NEXT: v_rndne_f32_e32 v0, s4 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: fnearbyint_v4f32: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_rndne_f32_e32 v3, s7 +; GFX10-SDAG-NEXT: v_rndne_f32_e32 v2, s6 +; GFX10-SDAG-NEXT: v_rndne_f32_e32 v1, s5 +; GFX10-SDAG-NEXT: v_rndne_f32_e32 v0, s4 +; GFX10-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-SDAG-NEXT: 
s_endpgm +; +; GFX10-GISEL-LABEL: fnearbyint_v4f32: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_rndne_f32_e32 v0, s4 +; GFX10-GISEL-NEXT: v_rndne_f32_e32 v1, s5 +; GFX10-GISEL-NEXT: v_rndne_f32_e32 v2, s6 +; GFX10-GISEL-NEXT: v_rndne_f32_e32 v3, s7 +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fnearbyint_v4f32: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_rndne_f32_e32 v3, s7 +; GFX11-SDAG-NEXT: v_rndne_f32_e32 v2, s6 +; GFX11-SDAG-NEXT: v_rndne_f32_e32 v1, s5 +; GFX11-SDAG-NEXT: v_rndne_f32_e32 v0, s4 +; GFX11-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fnearbyint_v4f32: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_rndne_f32_e32 v0, s4 +; GFX11-GISEL-NEXT: v_rndne_f32_e32 v1, s5 +; GFX11-GISEL-NEXT: v_rndne_f32_e32 v2, s6 +; GFX11-GISEL-NEXT: v_rndne_f32_e32 v3, s7 +; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm entry: %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) store <4 x float> %0, ptr addrspace(1) %out @@ -245,6 
+394,15 @@ ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; +; GFX10-LABEL: nearbyint_f64: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; ; GFX11-LABEL: nearbyint_f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 @@ -318,19 +476,55 @@ ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: nearbyint_v2f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: nearbyint_v2f64: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX10-SDAG-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: nearbyint_v2f64: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: 
s_endpgm +; +; GFX11-SDAG-LABEL: nearbyint_v2f64: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX11-SDAG-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: nearbyint_v2f64: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm entry: %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) store <2 x double> %0, ptr addrspace(1) %out @@ -422,23 +616,69 @@ ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: nearbyint_v4f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] -; GFX11-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] -; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: nearbyint_v4f64: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] +; GFX10-SDAG-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] +; GFX10-SDAG-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX10-SDAG-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: nearbyint_v4f64: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX10-GISEL-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] +; GFX10-GISEL-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] +; GFX10-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: nearbyint_v4f64: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] +; GFX11-SDAG-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] +; GFX11-SDAG-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX11-SDAG-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; 
GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: nearbyint_v4f64: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX11-GISEL-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] +; GFX11-GISEL-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-GISEL-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm entry: %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) store <4 x double> %0, ptr addrspace(1) %out