diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9622,16 +9622,7 @@
 
   // sub x, zext (setcc) => subcarry x, 0, setcc
   // sub x, sext (setcc) => addcarry x, 0, setcc
-
-  bool Commuted = false;
-  unsigned Opc = LHS.getOpcode();
-  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
-      Opc == ISD::ANY_EXTEND) {
-    std::swap(RHS, LHS);
-    Commuted = true;
-  }
-
-  Opc = RHS.getOpcode();
+  unsigned Opc = RHS.getOpcode();
   switch (Opc) {
   default: break;
   case ISD::ZERO_EXTEND:
@@ -9643,22 +9634,8 @@
     if (!isBoolSGPR(Cond))
       break;
     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
-    SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
-    SDValue Args[3];
-    Args[2] = Cond;
-
-    if (Commuted) {
-      // sub zext (setcc), x => addcarry 0, x, setcc
-      // sub sext (setcc), x => subcarry 0, x, setcc
-      Args[0] = Zero;
-      Args[1] = LHS;
-      Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
-    } else {
-      Args[0] = LHS;
-      Args[1] = Zero;
-      Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
-    }
-
+    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
+    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
     return DAG.getNode(Opc, SL, VTList, Args);
   }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -127,32 +127,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}sub_sube_commuted:
-; GCN-DAG: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: buffer_load_dword [[V:v[0-9]+]],
-; GCN: v_addc_u32_e32 [[ADDC:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
-; GCN: v_add_i32_e32 {{.*}}, 0x64, [[ADDC]]
-
-; GFX9-LABEL: {{^}}sub_sube_commuted:
-; GFX9-DAG: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; GFX9-DAG: global_load_dword [[V:v[0-9]+]],
-; GFX9: v_addc_co_u32_e32 [[ADDC:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
-; GFX9: v_add_u32_e32 {{.*}}, 0x64, [[ADDC]]
-define amdgpu_kernel void @sub_sube_commuted(i32 addrspace(1)* nocapture %arg, i32 %a) {
-bb:
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
-  %cmp = icmp ugt i32 %x, %y
-  %ext = sext i1 %cmp to i32
-  %adde = add i32 %v, %ext
-  %sub = sub i32 %adde, %a
-  %sub2 = sub i32 100, %sub
-  store i32 %sub2, i32 addrspace(1)* %gep, align 4
-  ret void
-}
-
 ; GCN-LABEL: {{^}}sube_sub:
 ; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
@@ -309,49 +283,6 @@
   ret void
 }
 
-; Check case where sub is commuted with zext
-; GCN-LABEL: {{^}}sub_zext_setcc_commute:
-; GCN: v_cmp_gt_u32_e32 vcc, v
-; GCN-NOT: vcc
-; GCN: v_addc_u32_e32 [[ADDC:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-; GCN: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, s{{[0-9]+}}, [[ADDC]]
-define amdgpu_kernel void @sub_zext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) {
-bb:
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
-  %cmp = icmp ugt i32 %x, %y
-  %ext = zext i1 %cmp to i32
-  %adde = sub i32 %v, %ext
-  %sub = sub i32 %a, %adde
-  %sub2 = sub i32 %sub, %b
-  store i32 %sub2, i32 addrspace(1)* %gep, align 4
-  ret void
-}
-
-; Check case where sub is commuted with sext
-; GCN-LABEL: {{^}}sub_sext_setcc_commute:
-; GCN: v_cmp_gt_u32_e32 vcc, v
-; GCN-NOT: vcc
-; GCN: v_subb_u32_e32 [[SUBB:v[0-9]+]], vcc, 0, v{{[0-9]+}}, vcc
-; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[SUBB]]
-; GCN: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, s{{[0-9]+}}, [[ADD]]
-define amdgpu_kernel void @sub_sext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) {
-bb:
-  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
-  %cmp = icmp ugt i32 %x, %y
-  %ext = sext i1 %cmp to i32
-  %adde = sub i32 %v, %ext
-  %sub = sub i32 %a, %adde
-  %sub2 = sub i32 %sub, %b
-  store i32 %sub2, i32 addrspace(1)* %gep, align 4
-  ret void
-}
-
 declare i1 @llvm.amdgcn.class.f32(float, i32) #0
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs <%s | FileCheck -check-prefixes=GCN %s
+;
+; This test checks that we have the correct fold for zext(cc1) - zext(cc2).
+;
+; GCN-LABEL: sub_zext_zext:
+; GCN: ds_read_b32 [[VAL:v[0-9]+]],
+; GCN-DAG: v_cmp_lt_f32{{.*}} [[CC1:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; GCN-DAG: v_cmp_gt_f32{{.*}} vcc, 0, [[VAL]]
+; GCN: v_cndmask_{{.*}} [[ZEXTCC1:v[0-9]+]], 0, 1, [[CC1]]
+; GCN: v_subbrev{{.*}} {{v[0-9]+}}, vcc, 0, [[ZEXTCC1]], vcc
+;
+; Before the reversion that this test is attached to, the compiler commuted
+; the operands to the sub and used different logic to select the addc/subc
+; instruction:
+;    sub zext (setcc), x => addcarry 0, x, setcc
+;    sub sext (setcc), x => subcarry 0, x, setcc
+;
+; ... but that is bogus. I believe it is not possible to fold those commuted
+; patterns into any form of addcarry or subcarry.
+
+define amdgpu_cs float @sub_zext_zext() {
+.entry:
+
+  %t519 = load float, float addrspace(3)* null
+
+  %t524 = fcmp ogt float %t519, 0.000000e+00
+  %t525 = fcmp olt float %t519, 0.000000e+00
+  %t526 = zext i1 %t524 to i32
+  %t527 = zext i1 %t525 to i32
+  %t528 = sub nsw i32 %t526, %t527
+  %t529 = sitofp i32 %t528 to float
+  ret float %t529
+}
+
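For reference, the "bogus" claim in the new test's comment can be sanity-checked with plain 32-bit arithmetic. The sketch below is not part of the patch: it models only the i32 value result of ISD::ADDCARRY and ISD::SUBCARRY (a + b + carry-in and a - b - borrow-in), it ignores the carry-out bit those nodes also produce, and the addcarry/subcarry helper names are illustrative. Under that model, the two folds this change keeps hold for both boolean values, while the reverted "sub zext (setcc), x => addcarry 0, x, setcc" rewrite fails on a concrete counterexample. (The commuted sext pattern happens to agree on the value alone, so whatever goes wrong there is outside what this value-only model can show.)

// Value-only model of the carry nodes; not part of the patch.
#include <cassert>
#include <cstdint>

static uint32_t addcarry(uint32_t A, uint32_t B, uint32_t CarryIn) {
  return A + B + CarryIn;  // i32 value result of ISD::ADDCARRY
}

static uint32_t subcarry(uint32_t A, uint32_t B, uint32_t BorrowIn) {
  return A - B - BorrowIn; // i32 value result of ISD::SUBCARRY
}

int main() {
  const uint32_t X = 5;                  // arbitrary non-boolean operand
  for (uint32_t CC = 0; CC <= 1; ++CC) { // both values of the setcc result
    uint32_t ZExt = CC;                  // zext i1 %cc to i32: 0 or 1
    uint32_t SExt = 0u - CC;             // sext i1 %cc to i32: 0 or 0xFFFFFFFF

    // The folds this patch keeps hold for all inputs:
    assert(X - ZExt == subcarry(X, 0, CC)); // sub x, zext(cc) => subcarry x, 0, cc
    assert(X - SExt == addcarry(X, 0, CC)); // sub x, sext(cc) => addcarry x, 0, cc
  }

  // The reverted commuted zext fold is wrong: with cc = 1 and x = 5,
  // zext(cc) - x = 0xFFFFFFFC (i.e. -4), but addcarry(0, x, cc) = 6.
  assert(1u - X != addcarry(0, X, 1));
  return 0;
}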