diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -1,16 +1,25 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}load_2dmsaa: -; GFX11: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm ; define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +; GFX11-LABEL: load_2dmsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x98,0x01,0x60,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } -; GCN-LABEL: {{^}}load_2dmsaa_both: -; GFX11: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { +; GFX11-LABEL: load_2dmsaa_both: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x00,0x00,0x60,0x00] +; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 2, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -19,17 +28,25 @@ ret <4 x float> %v.vec } -; GCN-LABEL: {{^}}load_2darraymsaa: -; GFX11: image_msaa_load v[0:3], v[0:3], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX11-LABEL: load_2darraymsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:3], v[0:3], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x9c,0x04,0x60,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } -; GCN-LABEL: {{^}}load_2darraymsaa_tfe: -; GFX11: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX11-LABEL: load_2darraymsaa_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:4], v[0:3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x00,0x00,0x20,0x00] +; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -38,41 +55,58 @@ ret <4 x float> %v.vec } -; GCN-LABEL: {{^}}load_2dmsaa_glc: -; GFX11: image_msaa_load v[0:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc ; define amdgpu_ps <4 x float> @load_2dmsaa_glc(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +; GFX11-LABEL: load_2dmsaa_glc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc ; encoding: [0x98,0x41,0x60,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 1) ret <4 x float> %v } -; GCN-LABEL: {{^}}load_2dmsaa_slc: -; GFX11: image_msaa_load v[0:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm slc ; define amdgpu_ps <4 x float> @load_2dmsaa_slc(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +; GFX11-LABEL: load_2dmsaa_slc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm slc ; encoding: [0x98,0x11,0x60,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 2) ret <4 x float> %v } -; GCN-LABEL: {{^}}load_2dmsaa_glc_slc: -; GFX11: image_msaa_load v[0:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc ; define amdgpu_ps <4 x float> @load_2dmsaa_glc_slc(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +; GFX11-LABEL: load_2dmsaa_glc_slc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc ; encoding: [0x98,0x51,0x60,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 3) ret <4 x float> %v } -; GCN-LABEL: {{^}}load_2dmsaa_d16: -; GFX11: image_msaa_load v[0:1], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm d16 ; define amdgpu_ps <4 x half> @load_2dmsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +; GFX11-LABEL: load_2dmsaa_d16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:1], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm d16 ; encoding: [0x98,0x01,0x62,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x half> %v } -; GCN-LABEL: {{^}}load_2dmsaa_tfe_d16: -; GFX11: image_msaa_load v[0:2], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { +; GFX11-LABEL: load_2dmsaa_tfe_d16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:2], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x half>, i32} %v, 0 @@ -81,17 +115,25 @@ ret <4 x half> %v.vec } -; GCN-LABEL: {{^}}load_2darraymsaa_d16: -; GFX11: image_msaa_load v[0:1], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm d16 ; define amdgpu_ps <4 x half> @load_2darraymsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX11-LABEL: load_2darraymsaa_d16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:1], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm d16 ; encoding: [0x9c,0x01,0x62,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x half> %v } -; GCN-LABEL: {{^}}load_2darraymsaa_tfe_d16: -; GFX11: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +; GFX11-LABEL: load_2darraymsaa_tfe_d16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9c,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x half>, i32} %v, 0 @@ -100,17 +142,26 @@ ret <4 x half> %v.vec } -; GCN-LABEL: {{^}}load_2dmsaa_a16: -; GFX11: image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %fragid) { +; GFX11-LABEL: load_2dmsaa_a16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x01,0x61,0xf0,0x01,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32 1, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } -; GCN-LABEL: {{^}}load_2darraymsaa_a16: -; GFX11: image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; define amdgpu_ps <4 x float> @load_2darraymsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { +; GFX11-LABEL: load_2darraymsaa_a16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x04,0x61,0xf0,0x01,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32 4, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -1,21 +1,95 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fabs.f16(half) declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) -; GCN-LABEL: {{^}}v_cnd_nan_nosgpr: -; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0 -; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], -1, 0 -; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]] -; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff -; GCN: s_endpgm define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { +; SI-LABEL: v_cnd_nan_nosgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_cmp_eq_u32 s8, 0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_cnd_nan_nosgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_cnd_nan_nosgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cnd_nan_nosgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx %f = load float, ptr addrspace(1) %f.gep @@ -25,23 +99,64 @@ ret void } - ; This requires slightly trickier SGPR operand legalization since the ; single constant bus SGPR usage is the last operand, and it should ; never be moved. ; However on GFX10 constant bus is limited to 2 scalar operands, not one. - -; GCN-LABEL: {{^}}v_cnd_nan: -; SIVI: s_cmp_eq_u32 s{{[0-9]+}}, 0 -; SIVI: s_cselect_b64 vcc, -1, 0 -; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc -; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0 -; GFX10: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]] -; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff -; GCN: s_endpgm define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { +; SI-LABEL: v_cnd_nan: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cmp_eq_u32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_cnd_nan: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_cnd_nan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s2, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cnd_nan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f store float %select, ptr addrspace(1) %out @@ -51,16 +166,63 @@ ; Test different compare and select operand types for optimal code ; shrinking. ; (select (cmp (sgprX, constant)), constant, sgprZ) - -; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32: -; GCN: s_load_{{dwordx2|b64}} s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s[0:1], {{0x4c|0x13}} - -; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0 -; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0 -; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]] -; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]] -; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]] define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { +; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -70,14 +232,63 @@ ret void } -; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32: -; GCN: s_load_{{dword|b32}} [[X:s[0-9]+]] -; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0 -; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0 -; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]] -; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]] -; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]] define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { +; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -87,14 +298,63 @@ ret void } -; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32: -; GCN-DAG: s_load_{{dwordx2|b64}} s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0 -; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0 -; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]] -; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]] -; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]] define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { +; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -104,14 +364,63 @@ ret void } -; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32: -; GCN: s_load_{{dword|b32}} [[X:s[0-9]+]] -; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0 -; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0 -; SIVI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[X]] -; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]] -; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]] define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { +; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext @@ -121,12 +430,80 @@ ret void } -; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32: -; GCN-DAG: s_load_{{dword|b32}} [[X:s[0-9]+]] -; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]] -; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 -; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]] define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext @@ -138,12 +515,80 @@ ret void } -; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32: -; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]] -; GCN-DAG: s_load_{{dword|b32}} [[X:s[0-9]+]] -; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 -; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]] define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext @@ -155,14 +600,73 @@ ret void } -; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32: -; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN-DAG: s_load_{{dword|b32}} [[Z:s[0-9]+]] -; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]] -; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] -; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { +; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -174,12 +678,85 @@ ret void } -; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]] -; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] -; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5 +; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -193,12 +770,85 @@ ret void } -; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]] -; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]] -; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5 +; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -212,13 +862,89 @@ ret void } -; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64: -; GCN: {{buffer|flat|global}}_load_{{dwordx2|b64}} v[[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]] -; GCN-DAG: {{buffer|flat|global}}_load_{{dwordx2|b64}} v[[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]] -; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, v[[[X_LO]]:[[X_HI]]] -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -232,16 +958,102 @@ ret void } -; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dword|b128}} - -; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]] -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc +; VI-NEXT: flat_load_dword v6, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 +; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -255,16 +1067,102 @@ ret void } -; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dword|b128}} - -; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]] -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc +; VI-NEXT: flat_load_dword v6, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 +; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -280,17 +1178,102 @@ ; This must be swapped as a vector type before the condition has ; multiple uses. - -; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dword|b128}} - -; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]] -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc +; VI-NEXT: flat_load_dword v6, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 +; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -304,16 +1287,100 @@ ret void } -; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1: -; GCN: load_{{dword|b32}} -; GCN: load_{{ubyte|u8}} -; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v -; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1, -; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v -; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}} -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s -; GCN: store_{{byte|b8}} define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[2:3], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_and_b32_e32 v3, 1, v3 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 +; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ubyte v3, v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; VI-NEXT: v_and_b32_e32 v3, 1, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 +; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -328,16 +1395,96 @@ } ; Different types compared vs. selected -; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: -; SIVI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000 -; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN-DAG: {{buffer|flat|global}}_load_{{dwordx2|b64}} - -; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] -; SIVI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc -; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000 +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc +; VI-NEXT: flat_load_dword v6, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v5 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -352,14 +1499,94 @@ } ; Different types compared vs. selected -; GCN-LABEL: {{^}}fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dwordx2|b64}} - -; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]] -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc +; VI-NEXT: flat_load_dword v6, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v5 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v6 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -374,13 +1601,85 @@ } ; Different types compared vs. selected -; GCN-LABEL: {{^}}icmp_vgprX_k0_selectf32_k1_vgprZ_i32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]] - -; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]] -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, 4.0, v3, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v5 +; VI-NEXT: v_cndmask_b32_e32 v2, 4.0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -395,14 +1694,101 @@ } ; FIXME: Should be able to handle multiple uses - -; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]] - -; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]] -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { +; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, v3, -1.0, vcc +; SI-NEXT: v_cndmask_b32_e64 v3, v3, -2.0, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v5 +; VI-NEXT: v_cndmask_b32_e64 v3, v2, -1.0, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext @@ -419,10 +1805,97 @@ } ; Source modifiers abs/neg only work for f32 - -; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16: -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { +; SI-LABEL: v_cndmask_abs_neg_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_cndmask_abs_neg_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_cndmask_abs_neg_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX10-NEXT: global_store_short v2, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cndmask_abs_neg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx %f = load half, ptr addrspace(1) %f.gep @@ -434,9 +1907,86 @@ ret void } -; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32: -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|, define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { +; SI-LABEL: v_cndmask_abs_neg_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_cndmask_abs_neg_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_cndmask_abs_neg_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cndmask_abs_neg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx %f = load float, ptr addrspace(1) %f.gep @@ -448,10 +1998,100 @@ ret void } -; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64: -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { +; SI-LABEL: v_cndmask_abs_neg_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_cndmask_abs_neg_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_cndmask_abs_neg_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cndmask_abs_neg_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx %f = load double, ptr addrspace(1) %f.gep