diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s -; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 { +; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -51,6 +60,16 @@ } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 { +; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x3c00 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -99,6 +118,15 @@ } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 { +; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -147,6 +175,15 @@ } define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 { +; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-GFX9: ; %bb.0: ; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -172,6 +209,16 @@ ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; GISEL-GFX9: ; %bb.0: ; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -202,6 +249,15 @@ } define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 { +; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; SDAG-GFX9: ; %bb.0: ; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -227,6 +283,16 @@ ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; GISEL-GFX9: ; %bb.0: ; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -270,6 +336,16 @@ } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 { +; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -316,6 +392,13 @@ } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 { +; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -369,6 +452,16 @@ } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 { +; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -445,5 +538,3 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -1,15 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s -; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s define half @mixlo_simple(float %src0, float %src1, float %src2) #0 { +; GFX1100-LABEL: mixlo_simple: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: mixlo_simple: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -49,6 +58,13 @@ } define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -97,6 +113,13 @@ } define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -142,6 +165,13 @@ } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -197,6 +227,15 @@ } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -249,6 +288,16 @@ ; operation only clobbers relevant lane. define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_v2f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_v2f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -343,6 +392,17 @@ } define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_v3f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: v_mov_b32_e32 v0, v6 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -481,6 +541,19 @@ } define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_v4f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v4f32: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -655,6 +728,16 @@ ; FIXME (DAG): Fold clamp define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -768,6 +851,20 @@ ; FIXME (GIsel): V_PK_MAX clamp could be folded into mixlo define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0 +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -843,6 +940,19 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -934,6 +1044,19 @@ } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1115,6 +1238,16 @@ ; a build_vector to select the mixhi. Issue is more specifically with how insert_vector_elt is being ; legalized (bitwise ops instead of shuffle/build_vector for instance). define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1170,6 +1303,20 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, v3 +; GISEL-GFX1100-NEXT: v_max_f16_e64 v3, v3, v3 clamp +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GISEL-GFX1100-NEXT: v_and_or_b32 v0, 0xffff0000, v4, v0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1249,6 +1396,16 @@ } define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1304,6 +1461,20 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GISEL-GFX1100-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1390,6 +1561,19 @@ ; FIXME (DAG): Should be able to use mixlo/mixhi define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1449,6 +1633,19 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v1, v0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1512,6 +1709,21 @@ ; FIXME (DAG): Handling undef 4th component define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v2 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1589,6 +1801,21 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v2, v0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1664,6 +1891,25 @@ } define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v3 +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, v2 +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1762,6 +2008,25 @@ ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v3, v0 +; GISEL-GFX1100-NEXT: v_pack_b32_f16 v1, v2, v1 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1880,5 +2145,3 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -1,15 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -53,6 +62,13 @@ } define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -102,6 +118,13 @@ } define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -148,6 +171,16 @@ } define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_v2f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -246,6 +279,16 @@ } define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_v2f32_shuffle: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_v2f32_shuffle: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -322,6 +365,13 @@ } define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -375,6 +425,13 @@ } define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -419,6 +476,13 @@ } define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -464,6 +528,13 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -504,6 +575,13 @@ } define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -545,6 +623,13 @@ } define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -586,6 +671,13 @@ } define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -632,6 +724,15 @@ ; inline immediate. define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: s_mov_b32 s0, 1.0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -660,6 +761,15 @@ ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 1.0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -688,6 +798,15 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0.15915494 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -716,6 +835,15 @@ ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 0.15915494 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -751,6 +879,15 @@ ; f32 1/2pi = 0x3e22f983 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0x3e230000 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -779,6 +916,15 @@ ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 0x3e230000 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -819,6 +965,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0x367c0000 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -847,6 +1002,15 @@ ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 0x367c0000 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -886,6 +1050,17 @@ } define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { +; GFX1100-LABEL: v_mad_mix_v2f32_f32imm1: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_mov_b32 s0, 1.0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -976,6 +1151,17 @@ } define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { +; GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_mov_b32 s0, 0x3e230000 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1071,6 +1257,17 @@ } define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { +; GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_mov_b32 s0, 0.15915494 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1164,6 +1361,13 @@ } define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1212,6 +1416,13 @@ } define float @no_mix_simple(float %src0, float %src1, float %src2) #0 { +; GFX1100-LABEL: no_mix_simple: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: no_mix_simple: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1240,6 +1451,13 @@ } define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 { +; GFX1100-LABEL: no_mix_simple_fabs: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: no_mix_simple_fabs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1273,6 +1491,13 @@ ; v_mad_mix_f32 flushes. define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %src1, half %src2) #1 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1320,6 +1545,13 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, float %src2) #1 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1363,6 +1595,18 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX1100-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1418,6 +1662,17 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1468,6 +1723,13 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1512,6 +1774,13 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, float %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1553,6 +1822,13 @@ } define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1612,6 +1888,16 @@ ; Make sure we don't fold pre-cvt fneg if we already have a fabs define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1666,6 +1952,13 @@ } define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1715,6 +2008,13 @@ } define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1775,6 +2075,13 @@ } define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1835,6 +2142,13 @@ } define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -1,9 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SAFE %s ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-SAFE %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE %s ; RUN: llc -march=amdgcn -mcpu=hawaii -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=CI,CI-NSZ %s ; RUN: llc -march=amdgcn -mcpu=fiji -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=VI,VI-NSZ %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ %s define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_fabs_f16: @@ -27,6 +29,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_add_f16_e64 v0, |v0|, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fabs_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -61,6 +73,17 @@ ; VI-NEXT: v_add_f16_e64 v0, |v0|, v4 ; VI-NEXT: v_add_f16_e64 v1, |v1|, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e64 v1, |v1|, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -97,6 +120,17 @@ ; VI-NEXT: v_add_f16_e64 v0, |v0|, v3 ; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -133,6 +167,17 @@ ; VI-NEXT: v_add_f16_e64 v0, |v0|, v3 ; VI-NEXT: v_add_f16_e64 v1, |v2|, v4 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e64 v1, |v2|, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -167,6 +212,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fabs_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half %fabs.x, half %y @@ -196,6 +252,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fabs_negk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half %fabs, half -1.0 @@ -224,6 +291,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_add_f16_e64 v0, |v0|, v1 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fabs_negk_negk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2.0, half -1.0 %fabs = call half @llvm.fabs.f16(half %select) @@ -251,6 +329,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_posk_posk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half 2.0, half 1.0 %add = fadd half %select, %x @@ -279,6 +368,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_negk_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half -1.0, half %fabs @@ -309,6 +409,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_negliteralk_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half -1024.0, half %fabs @@ -337,6 +448,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_add_f16_e64 v0, |v0|, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fabs_posk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half %fabs, half 1.0 @@ -365,6 +486,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_add_f16_e64 v0, |v0|, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_posk_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half 1.0, half %fabs @@ -394,6 +525,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fneg_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -428,6 +569,17 @@ ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_sub_f16_e32 v1, v4, v1 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_sub_f16_e32 v1, v4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -464,6 +616,17 @@ ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -500,6 +663,17 @@ ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_sub_f16_e32 v1, v4, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_sub_f16_e32 v1, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -534,6 +708,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fneg_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half %y @@ -562,6 +747,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fneg_negk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half -1.0 @@ -591,6 +786,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fneg_inv2pi_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half 0xH3118 @@ -620,6 +825,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fneg_neginv2pi_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half 0xHB118 @@ -647,6 +862,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_negk_negk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2.0, half -1.0 %add = fadd half %select, %x @@ -675,6 +901,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_negliteralk_negliteralk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0xe800 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2048.0, half -4096.0 %add = fadd half %select, %x @@ -701,6 +938,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_sub_f16_e32 v0, v1, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fneg_negk_negk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NEXT: v_sub_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2.0, half -1.0 %fneg.x = fneg half %select @@ -729,6 +977,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_negk_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half -1.0, half %fneg.x @@ -757,6 +1015,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fneg_posk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half 1.0 @@ -785,6 +1053,16 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_posk_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half 1.0, half %fneg.x @@ -816,6 +1094,18 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_negfabs_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -849,6 +1139,18 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fabs_negfabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -882,6 +1184,18 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_neg_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fsub half -0.000000e+00, %x %fabs.y = call half @llvm.fabs.f16(half %y) @@ -914,6 +1228,18 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_fabs_neg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.y = fsub half -0.000000e+00, %y @@ -945,6 +1271,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_neg_negfabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fsub half -0.000000e+00, %x %fabs.y = call half @llvm.fabs.f16(half %y) @@ -977,6 +1314,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: add_select_negfabs_neg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1008,6 +1356,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: mul_select_negfabs_posk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1038,6 +1397,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: mul_select_posk_negfabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1068,6 +1438,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: mul_select_negfabs_negk_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1098,6 +1479,17 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: mul_select_negk_negfabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1131,6 +1523,17 @@ ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SAFE-LABEL: select_fneg_posk_src_add_f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_add_f16_e32 v1, 4.0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; ; CI-NSZ-LABEL: select_fneg_posk_src_add_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1149,6 +1552,16 @@ ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-LABEL: select_fneg_posk_src_add_f16: +; GFX11-NSZ: ; %bb.0: +; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, -4.0, v1 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fadd half %x, 4.0 %fneg = fneg half %add @@ -1177,6 +1590,17 @@ ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SAFE-LABEL: select_fneg_posk_src_sub_f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; ; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1195,6 +1619,16 @@ ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_f16: +; GFX11-NSZ: ; %bb.0: +; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fsub half %x, 4.0 %fneg = fneg half %add @@ -1221,6 +1655,16 @@ ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: select_fneg_posk_src_mul_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %mul = fmul half %x, 4.0 %fneg = fneg half %mul @@ -1251,6 +1695,17 @@ ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SAFE-LABEL: select_fneg_posk_src_fma_f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1271,6 +1726,16 @@ ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_f16: +; GFX11-NSZ: ; %bb.0: +; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fma = call half @llvm.fma.f16(half %x, half 4.0, half %z) %fneg = fneg half %fma @@ -1302,6 +1767,17 @@ ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SAFE-LABEL: select_fneg_posk_src_fmad_f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 +; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1323,6 +1799,16 @@ ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_f16: +; GFX11-NSZ: ; %bb.0: +; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 +; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fmad = call half @llvm.fmuladd.f16(half %x, half 4.0, half %z) %fneg = fneg half %fmad diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: @@ -79,6 +80,44 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-NEXT: s_mov_b32 s14, -1 +; GFX11-NEXT: s_mov_b32 s15, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s14 +; GFX11-NEXT: s_mov_b32 s19, s15 +; GFX11-NEXT: s_mov_b32 s22, s14 +; GFX11-NEXT: s_mov_b32 s23, s15 +; GFX11-NEXT: s_mov_b32 s26, s14 +; GFX11-NEXT: s_mov_b32 s27, s15 +; GFX11-NEXT: s_mov_b32 s2, s14 +; GFX11-NEXT: s_mov_b32 s3, s15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s16, s6 +; GFX11-NEXT: s_mov_b32 s17, s7 +; GFX11-NEXT: s_mov_b32 s20, s8 +; GFX11-NEXT: s_mov_b32 s21, s9 +; GFX11-NEXT: s_mov_b32 s24, s10 +; GFX11-NEXT: s_mov_b32 s25, s11 +; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -161,6 +200,38 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c, @@ -241,6 +312,38 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c, @@ -322,6 +425,38 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_f16_imm_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -403,6 +538,38 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_f16_imm_d: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -515,6 +682,52 @@ ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-NEXT: s_mov_b32 s14, -1 +; GFX11-NEXT: s_mov_b32 s15, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s14 +; GFX11-NEXT: s_mov_b32 s3, s15 +; GFX11-NEXT: s_mov_b32 s22, s14 +; GFX11-NEXT: s_mov_b32 s23, s15 +; GFX11-NEXT: s_mov_b32 s18, s14 +; GFX11-NEXT: s_mov_b32 s19, s15 +; GFX11-NEXT: s_mov_b32 s26, s14 +; GFX11-NEXT: s_mov_b32 s27, s15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s20, s8 +; GFX11-NEXT: s_mov_b32 s21, s9 +; GFX11-NEXT: s_mov_b32 s16, s6 +; GFX11-NEXT: s_mov_b32 s17, s7 +; GFX11-NEXT: s_mov_b32 s24, s10 +; GFX11-NEXT: s_mov_b32 s25, s11 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -616,6 +829,47 @@ ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_v2f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c, @@ -715,6 +969,47 @@ ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_v2f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c, @@ -816,6 +1111,47 @@ ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_v2f16_imm_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -917,6 +1253,47 @@ ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: select_v2f16_imm_d: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600 @@ -42,6 +43,29 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: s_uint_to_fp_i64_to_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clz_i32_u32 s4, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s4, s4, 32 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s2, s2, 1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s2, 32, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = uitofp i64 %in to half store half %result, ptr addrspace(1) %out ret void @@ -98,6 +122,31 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: v_uint_to_fp_i64_to_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -143,6 +192,27 @@ ; GFX8-NEXT: v_ldexp_f32 v2, v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: s_uint_to_fp_i64_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clz_i32_u32 s4, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s4, s4, 32 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s2, s2, 1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s2, 32, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = uitofp i64 %in to float store float %result, ptr addrspace(1) %out ret void @@ -197,6 +267,29 @@ ; GFX8-NEXT: v_ldexp_f32 v2, v5, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: v_uint_to_fp_i64_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -258,6 +351,34 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clz_i32_u32 s2, s7 +; GFX11-NEXT: s_clz_i32_u32 s3, s5 +; GFX11-NEXT: s_min_u32 s8, s2, 32 +; GFX11-NEXT: s_min_u32 s9, s3, 32 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_min_u32 s2, s2, 1 +; GFX11-NEXT: s_min_u32 s4, s4, 1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX11-NEXT: s_sub_i32 s2, 32, s8 +; GFX11-NEXT: s_sub_i32 s3, 32, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v1, v0, s2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, s3 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x float> store <2 x float> %result, ptr addrspace(1) %out ret void @@ -367,6 +488,56 @@ ; GFX8-NEXT: v_ldexp_f32 v2, v4, v12 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 +; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 +; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v2, v1, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 +; GFX11-NEXT: v_ldexp_f32 v0, v4, v5 +; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid %out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid @@ -435,6 +606,39 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clz_i32_u32 s2, s7 +; GFX11-NEXT: s_clz_i32_u32 s3, s5 +; GFX11-NEXT: s_min_u32 s8, s2, 32 +; GFX11-NEXT: s_min_u32 s9, s3, 32 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_min_u32 s2, s2, 1 +; GFX11-NEXT: s_min_u32 s4, s4, 1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX11-NEXT: s_sub_i32 s2, 32, s8 +; GFX11-NEXT: s_sub_i32 s3, 32, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX11-NEXT: v_ldexp_f32 v1, v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x half> store <2 x half> %result, ptr addrspace(1) %out ret void @@ -558,6 +762,65 @@ ; GFX8-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 +; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 +; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f32 v2, v2, v11 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid