Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -692,6 +692,7 @@ SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N); SDValue ScalarizeVecRes_SMULFIX(SDNode *N); + SDValue ScalarizeVecRes_Overflow(SDNode *N); // Vector Operand Scalarization: <1 x ty> -> ty. bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); @@ -728,6 +729,7 @@ void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_Overflow(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -174,6 +174,12 @@ case ISD::SMULFIX: R = ScalarizeVecRes_SMULFIX(N); break; + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + R = ScalarizeVecRes_Overflow(N); + break; } // If R is null, the sub-method took care of registering the result. 
@@ -204,6 +210,53 @@
                      Op2);
 }

+/// Scalarize a <1 x ty> SADDO/UADDO/SSUBO/USUBO node.
+///
+/// A single scalar node is built from the one element of each operand, and
+/// both results of \p N (the value and the overflow flag) are redirected to
+/// it here. A null SDValue is therefore always returned, which the dispatching
+/// switch treats as "results already registered".
+SDValue DAGTypeLegalizer::ScalarizeVecRes_Overflow(SDNode *N) {
+  SDLoc DL(N);
+
+  // Either result may be the one that needs scalarizing. The operands can
+  // only be looked up as scalarized vectors when result 0, whose element type
+  // they share, is itself being scalarized; otherwise extract the element.
+  SDValue Op0, Op1;
+  if (getTypeAction(N->getValueType(0)) ==
+      TargetLowering::TypeScalarizeVector) {
+    Op0 = GetScalarizedVector(N->getOperand(0));
+    Op1 = GetScalarizedVector(N->getOperand(1));
+  } else {
+    SmallVector<SDValue, 1> Elts0, Elts1;
+    DAG.ExtractVectorElements(N->getOperand(0), Elts0);
+    DAG.ExtractVectorElements(N->getOperand(1), Elts1);
+    Op0 = Elts0[0];
+    Op1 = Elts1[0];
+  }
+
+  EVT CondTy = N->getValueType(1).getVectorElementType();
+  SDValue Result =
+      DAG.getNode(N->getOpcode(), DL,
+                  DAG.getVTList(Op0.getValueType(), CondTy), Op0, Op1);
+
+  // Register each result according to its own type action: one that is not
+  // being scalarized is rebuilt as a vector and has its uses replaced rather
+  // than being entered into the scalarized-vector map.
+  for (unsigned ResNo = 0; ResNo != 2; ++ResNo) {
+    SDValue Old(N, ResNo);
+    SDValue New = Result.getValue(ResNo);
+    if (getTypeAction(Old.getValueType()) ==
+        TargetLowering::TypeScalarizeVector) {
+      SetScalarizedVector(Old, New);
+    } else {
+      ReplaceValueWith(Old, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
+                                        Old.getValueType(), New));
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
   EVT VT = N->getValueType(0).getVectorElementType();
   unsigned NumOpers = N->getNumOperands();
@@ -861,6 +914,12 @@
   case ISD::SMULFIX:
     SplitVecRes_SMULFIX(N, Lo, Hi);
     break;
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+    SplitVecRes_Overflow(N, Lo, Hi);
+    break;
   }

   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -912,6 +971,56 @@
   Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2);
 }

+/// Split a SADDO/UADDO/SSUBO/USUBO node with vector results into two
+/// half-width nodes.
+///
+/// Both results of \p N (the value and the overflow flag) are registered
+/// here, so the Lo/Hi out-parameters are deliberately left untouched; the
+/// dispatching switch treats null Lo/Hi as "results already registered".
+void DAGTypeLegalizer::SplitVecRes_Overflow(SDNode *N, SDValue &, SDValue &) {
+  SDLoc dl(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+
+  // Either result may be the one that needs splitting. The operands can only
+  // be looked up as split vectors when result 0, whose type they share, is
+  // itself being split; otherwise split them by hand.
+  SDValue LHSLo, LHSHi;
+  SDValue RHSLo, RHSHi;
+  if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
+    GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+    GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+  } else {
+    std::tie(LHSLo, LHSHi) = DAG.SplitVectorOperand(N, 0);
+    std::tie(RHSLo, RHSHi) = DAG.SplitVectorOperand(N, 1);
+  }
+
+  EVT OvLoVT, OvHiVT;
+  std::tie(OvLoVT, OvHiVT) = DAG.GetSplitDestVTs(OvVT);
+
+  unsigned Opcode = N->getOpcode();
+  SDValue Lo = DAG.getNode(
+      Opcode, dl, DAG.getVTList(LHSLo.getValueType(), OvLoVT), LHSLo, RHSLo);
+  SDValue Hi = DAG.getNode(
+      Opcode, dl, DAG.getVTList(LHSHi.getValueType(), OvHiVT), LHSHi, RHSHi);
+
+  // Register each result according to its own type action: one that is not
+  // being split is rebuilt with CONCAT_VECTORS and has its uses replaced
+  // rather than being entered into the split-vector map.
+  for (unsigned ResNo = 0; ResNo != 2; ++ResNo) {
+    SDValue Old(N, ResNo);
+    SDValue NewLo = Lo.getValue(ResNo);
+    SDValue NewHi = Hi.getValue(ResNo);
+    if (getTypeAction(Old.getValueType()) ==
+        TargetLowering::TypeSplitVector) {
+      SetSplitVector(Old, NewLo, NewHi);
+    } else {
+      ReplaceValueWith(Old, DAG.getNode(ISD::CONCAT_VECTORS, dl,
+                                        Old.getValueType(), NewLo, NewHi));
+    }
+  }
+}
+
 void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   // We know the result is a vector. The input may be either a vector or a
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6102,7 +6102,8 @@
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));

-    SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+    SDVTList VTs = DAG.getVTList(Op1.getValueType(),
+                                 EVT::getEVT(I.getType()->getContainedType(1)));
     setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
     return nullptr;
   }
Index: test/CodeGen/AMDGPU/saddo.ll
===================================================================
--- test/CodeGen/AMDGPU/saddo.ll
+++ test/CodeGen/AMDGPU/saddo.ll
@@ -1,11 +1,14 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+

 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+
+declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+

 ; FUNC-LABEL: {{^}}saddo_i64_zext:
 define amdgpu_kernel
void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -65,3 +68,22 @@
   store i1 %carry, i1 addrspace(1)* %carryout
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_saddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind { ; <2 x i32> variant of the scalar saddo tests
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind ; field 0 holds the per-lane sums, field 1 the per-lane overflow bits
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32> ; widen the i1 lanes so they are stored as i32 lanes
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
Index: test/CodeGen/AMDGPU/ssubo.ll
===================================================================
--- test/CodeGen/AMDGPU/ssubo.ll
+++ test/CodeGen/AMDGPU/ssubo.ll
@@ -1,10 +1,11 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+

 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind
readnone
+declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

 ; FUNC-LABEL: {{^}}ssubo_i64_zext:
 define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
@@ -70,3 +71,25 @@
   store i1 %carry, i1 addrspace(1)* %carryout
   ret void
 }
+
+; NOTE(review): the RUN lines of this file shown in this patch only enable the
+; GCN, SI, VI, GFX9 and FUNC prefixes, so the SICIVI checks below appear never
+; to be run by FileCheck - confirm, and enable SICIVI on the SI/VI RUN lines.
+; FUNC-LABEL: {{^}}v_ssubo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+define amdgpu_kernel void @v_ssubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %ssub = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %ssub, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %ssub, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
Index: test/CodeGen/AMDGPU/uaddo.ll
===================================================================
--- test/CodeGen/AMDGPU/uaddo.ll
+++ test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,6 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s

 ; FUNC-LABEL:
{{^}}s_uaddo_i64_zext:
 ; GCN: s_add_u32
@@ -152,10 +151,36 @@
   ret void
 }

+; NOTE(review): no RUN line of this file shown in this patch enables the
+; SICIVI prefix, so the checks below appear never to be run by FileCheck. They
+; are also identical to the signed-add test's v_cmp_lt_i32 patterns, which
+; looks like a copy/paste for an unsigned add - confirm against llc output.
+; FUNC-LABEL: {{^}}v_uaddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %uadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %uadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %uadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) #1
+

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/usubo.ll
===================================================================
--- test/CodeGen/AMDGPU/usubo.ll
+++ test/CodeGen/AMDGPU/usubo.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC
%s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; NOTE(review): with the r600 RUN line removed, any EG-prefixed checks in this file are no longer run - confirm that is intended.

 ; FUNC-LABEL: {{^}}s_usubo_i64_zext:
 ; GCN: s_sub_u32
@@ -159,10 +159,28 @@
   ret void
 }

+; FUNC-LABEL: {{^}}v_usubo_v2i32:
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+define amdgpu_kernel void @v_usubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %usub = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %usub, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %usub, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1
 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) #1

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }