diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -658,6 +658,20 @@ switch (DefMI->getOpcode()) { default: break; + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FREM: + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FCOS: + case TargetOpcode::G_FMA: + case TargetOpcode::G_FMAD: + if (SNaN) + return true; + + // TODO: Need isKnownNeverInfinity + return false; case TargetOpcode::G_FMINNUM_IEEE: case TargetOpcode::G_FMAXNUM_IEEE: { if (SNaN) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll @@ -105,8 +105,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -205,7 +205,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul float %a, 2.0 @@ -222,7 +221,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX10-NEXT: v_max_f32_e32 v0, 0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir @@ -254,8 +254,7 @@ ; CHECK-NEXT: %one_s32:_(s32) = G_ANYEXT %one(s16) ; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %one_s32(s32), %undef(s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat - ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]] - ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FCANONICALIZE]] + ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FMUL]] ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef, [[FMAXNUM_IEEE]] ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -306,8 +305,7 @@ ; CHECK-NEXT: %qnan_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %qnan_s32(s32), %undef(s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat ; CHECK-NEXT: %snan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %snan_undef - ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]] - ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FCANONICALIZE]] + ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FMUL]] ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef, [[FMAXNUM_IEEE]] ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -453,8 +453,8 @@ ; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 ; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -484,13 +484,11 @@ ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 ; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 ; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -532,8 +530,8 @@ ; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1 ; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -563,13 +561,11 @@ ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1 ; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1 ; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -241,8 +241,8 @@ ; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 ; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -272,13 +272,11 @@ ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 ; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 ; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -6,14 +6,14 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s define half @v_fdiv_f16(half %a, half %b) { ; GFX6-IEEE-LABEL: v_fdiv_f16: @@ -771,42 +771,77 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16_afn: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_v2f16_afn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rcp_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_v2f16_afn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_rcp_f16_e32 v1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_rcp_f16_e32 v2, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b ret <2 x half> %fdiv } @@ -1479,42 +1514,77 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rcp_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_rcp_f16_e32 v1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_rcp_f16_e32 v2, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } @@ -1709,42 +1779,77 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rcp_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_rcp_f16_e32 v1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_rcp_f16_e32 v2, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -146,7 +146,6 @@ ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-NEXT: v_max_f32_e32 v2, v2, v3 @@ -186,7 +185,6 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -208,7 +206,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 @@ -231,7 +228,6 @@ ; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 @@ -251,11 +247,9 @@ ; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_sub_f32 v1, 0x80000000, v1 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir @@ -242,12 +242,8 @@ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FMUL]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32) + ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]] + ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32) %0:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00 %8:vgpr(s32) = COPY %2(s32) diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -33,8 +33,7 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: ;;#ASMSTART ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND @@ -83,8 +82,7 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: ;;#ASMSTART ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND @@ -230,8 +228,7 @@ ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 ; GISEL-NEXT: v_sub_f16_e32 v0, 0x8000, v0 ; GISEL-NEXT: v_sub_f16_e32 v1, 0x8000, v1 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: ;;#ASMSTART ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND