Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2703,6 +2703,25 @@ SDValue True = N->getOperand(1); SDValue False = N->getOperand(2); + if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. + SelectionDAG &DAG = DCI.DAG; + if ((DAG.isConstantValueOfAnyType(True) || + DAG.isConstantValueOfAnyType(True)) && + (!DAG.isConstantValueOfAnyType(False) && + !DAG.isConstantValueOfAnyType(False))) { + // Swap cmp + select pair to move constant to false input. + // This will allow using VOPC cndmasks more often. + // select (setcc x, y), k, x -> select (setcc y, x) x, x + + SDLoc SL(N); + ISD::CondCode NewCC = getSetCCInverse(cast(CC)->get(), + VT.isInteger()); + + SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); + return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); + } + } + if (VT == MVT::f32 && Cond.hasOneUse()) { SDValue MinMax = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -19,9 +19,9 @@ ; FUNC-LABEL: {{^}}s_ctlz_i32: ; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: s_flbit_i32_b32 [[CTLZ:s[0-9]+]], [[VAL]] -; GCN-DAG: v_cmp_eq_u32_e64 [[CMPZ:s\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} +; GCN-DAG: v_cmp_ne_u32_e64 vcc, [[VAL]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 [[VCTLZ:v[0-9]+]], [[CTLZ]] -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[VCTLZ]], 32, [[CMPZ]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[VCTLZ]], vcc ; GCN: buffer_store_dword [[RESULT]] ; GCN: s_endpgm @@ -36,8 +36,8 @@ ; FUNC-LABEL: {{^}}v_ctlz_i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] -; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[CTLZ]] -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc +; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -142,8 +142,8 @@ ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]] ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]] -; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[OR]] -; GCN-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc +; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]] +; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() Index: test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll =================================================================== --- test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -163,15 +163,12 @@ ret void } -; FIXME: This should invert the compare and select to shrink -; operation. - ; GCN-LABEL: {{^}}add_select_negk_fabs_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[X]], 1.0, s +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] define void @add_select_negk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef @@ -189,8 +186,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_cmp_eq_u32_e64 -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X]], [[K]], vcc +; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] define void @add_select_negliteralk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef @@ -225,8 +222,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[X]], 1.0, s +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] define void @add_select_posk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef @@ -460,8 +457,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[X]], 1.0, s +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define void @add_select_negk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef @@ -495,8 +492,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[X]], -1.0, s +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define void @add_select_posk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef @@ -665,8 +662,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[X]], 4.0, s +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] define void @mul_select_posk_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef Index: test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- test/CodeGen/AMDGPU/select.f16.ll +++ test/CodeGen/AMDGPU/select.f16.ll @@ -102,12 +102,12 @@ ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] -; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] +; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] +; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[C_F32]], v[[D_F32]], vcc ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}} -; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc +; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @select_f16_imm_c( @@ -262,12 +262,18 @@ ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e32 -; SI: v_cmp_lt_f32_e64 -; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_lt_f16_e64 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 + +; SI: v_cmp_lt_f32_e32 +; SI: v_cmp_lt_f32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e64 + +; VI: v_cmp_nlt_f16_e32 +; VI: v_cndmask_b32_e32 + +; VI: v_cmp_nlt_f16_e32 +; VI: v_cndmask_b32_e32 + ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/v_cndmask.ll =================================================================== --- test/CodeGen/AMDGPU/v_cndmask.ll +++ test/CodeGen/AMDGPU/v_cndmask.ll @@ -1,17 +1,18 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare i32 @llvm.amdgcn.workitem.id.x() #1 -; SI-LABEL: {{^}}v_cnd_nan_nosgpr: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} +; GCN-LABEL: {{^}}v_cnd_nan_nosgpr: +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 v{{[0-9]}}, -1, v{{[0-9]+}}, vcc +; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff -; SI: s_endpgm +; GCN: s_endpgm define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx - %f = load float, float addrspace(1)* %fptr + %f = load float, float addrspace(1)* %f.gep %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f store float %select, float addrspace(1)* %out @@ -23,11 +24,12 @@ ; single constant bus SGPR usage is the last operand, and it should ; never be moved. -; SI-LABEL: {{^}}v_cnd_nan: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} +; GCN-LABEL: {{^}}v_cnd_nan: +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 v{{[0-9]}}, -1, v{{[0-9]}}, vcc +; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff -; SI: s_endpgm +; GCN: s_endpgm define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f @@ -35,5 +37,327 @@ ret void } +; Test different compare and select operand types for optimal code +; shrinking. +; (select (cmp (sgprX, constant)), constant, sgprZ) + +; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: s_load_dword [[Z:s[0-9]+]] +; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc +define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %setcc = fcmp one float %x, 0.0 + %select = select i1 %setcc, float 1.0, float %z + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc +define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %setcc = fcmp one float %x, 0.0 + %select = select i1 %setcc, float 1.0, float %x + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc +define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %setcc = fcmp one float %x, 0.0 + %select = select i1 %setcc, float 0.0, float %z + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc +define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %setcc = fcmp one float %x, 0.0 + %select = select i1 %setcc, float 0.0, float %x + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] +; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[Z]], vcc +define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %z = load float, float addrspace(1)* %z.gep + %setcc = fcmp one float %x, 0.0 + %select = select i1 %setcc, float 0.0, float %z + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32: +; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] +; GCN-DAG: s_load_dword [[X:s[0-9]+]] +; GCN: v_cmp_nlg_f32_e64 vcc, [[X]], 0 +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc +define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %z = load float, float addrspace(1)* %z.gep + %setcc = fcmp one float %x, 0.0 + %select = select i1 %setcc, float 1.0, float %z + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32: +; GCN-DAG: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN-DAG: s_load_dword [[Z:s[0-9]+]] +; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]] +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc +define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %x = load float, float addrspace(1)* %x.gep + %setcc = fcmp olt float %x, 0.0 + %select = select i1 %setcc, float 1.0, float %z + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] +; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc +define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %x = load volatile float, float addrspace(1)* %x.gep + %z = load volatile float, float addrspace(1)* %z.gep + %setcc = fcmp ult float %x, 0.0 + %select = select i1 %setcc, float 1.0, float %z + store float %select, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] +; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc +define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %x = load volatile i32, i32 addrspace(1)* %x.gep + %z = load volatile i32, i32 addrspace(1)* %z.gep + %setcc = icmp slt i32 %x, 0 + %select = select i1 %setcc, i32 2, i32 %z + store i32 %select, i32 addrspace(1)* %out.gep + ret void +} + +; FIXME: Why does VI make the wrong regalloc choice? +; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64: +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]{{\]}} +; SI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} +; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc +; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc + +; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} +; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s +; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s +define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext + %x = load volatile i64, i64 addrspace(1)* %x.gep + %z = load volatile i64, i64 addrspace(1)* %z.gep + %setcc = icmp slt i64 %x, 0 + %select = select i1 %setcc, i64 2, i64 %z + store i64 %select, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dwordx4 + +; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]] +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc +define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext + %x = load volatile float, float addrspace(1)* %x.gep + %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep + %setcc = fcmp ugt float %x, 4.0 + %select = select i1 %setcc, <4 x float> %z, <4 x float> + store <4 x float> %select, <4 x float> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dwordx4 + +; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]] +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc +define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext + %x = load volatile float, float addrspace(1)* %x.gep + %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep + %setcc = fcmp ugt float %x, 4.0 + %select = select i1 %setcc, <4 x float> , <4 x float> %z + store <4 x float> %select, <4 x float> addrspace(1)* %out.gep + ret void +} + +; This must be swapped as a vector type before the condition has +; multiple uses. + +; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dwordx4 + +; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]] +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc +define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext + %x = load volatile float, float addrspace(1)* %x.gep + %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep + %setcc = fcmp ugt float 4.0, %x + %select = select i1 %setcc, <4 x float> , <4 x float> %z + store <4 x float> %select, <4 x float> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1: +; GCN: load_dword +; GCN: load_ubyte +; GCN-DAG: v_cmp_gt_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, 0, v +; DCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1, +; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, v +; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s +; GCN: store_byte +define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %tid.ext + %x = load volatile i32, i32 addrspace(1)* %x.gep + %z = load volatile i1, i1 addrspace(1)* %z.gep + %setcc = icmp slt i32 %x, 0 + %select = select i1 %setcc, i1 true, i1 %z + store i1 %select, i1 addrspace(1)* %out.gep + ret void +} + +; Different types compared vs. selected +; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dwordx2 + +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000 +; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc +; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc +define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %x = load volatile float, float addrspace(1)* %x.gep + %z = load volatile double, double addrspace(1)* %z.gep + %setcc = fcmp ult float %x, 0.0 + %select = select i1 %setcc, double 1.0, double %z + store double %select, double addrspace(1)* %out.gep + ret void +} + +; FIXME: Should be able to handle multiple uses + +; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: +; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] + +; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]] +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc +; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc +define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext + %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %x = load volatile float, float addrspace(1)* %x.gep + %z = load volatile float, float addrspace(1)* %z.gep + %setcc = fcmp ugt float 4.0, %x + %select0 = select i1 %setcc, float -1.0, float %z + %select1 = select i1 %setcc, float -2.0, float %z + store volatile float %select0, float addrspace(1)* %out.gep + store volatile float %select1, float addrspace(1)* %out.gep + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone }