Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -67,6 +67,42 @@
 
 namespace {
 
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    Out = C->getAPIntValue().getSExtValue();
+    return true;
+  }
+
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+    Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+// TODO: Handle undef as zero
+static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
+                                 bool Negate = false) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+  uint32_t LHSVal, RHSVal;
+  if (getConstantValue(N->getOperand(0), LHSVal) &&
+      getConstantValue(N->getOperand(1), RHSVal)) {
+    SDLoc SL(N);
+    uint32_t K = Negate ?
+      (-LHSVal & 0xffff) | (-RHSVal << 16) :
+      (LHSVal & 0xffff) | (RHSVal << 16);
+    return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
+                              DAG.getTargetConstant(K, SL, MVT::i32));
+  }
+
+  return nullptr;
+}
+
+static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
+  return packConstantV2I16(N, DAG, true);
+}
+
 /// AMDGPU specific code to select AMDGPU machine instructions for
 /// SelectionDAG operations.
 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -104,7 +140,11 @@
 private:
   std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
   bool isNoNanSrc(SDValue N) const;
-  bool isInlineImmediate(const SDNode *N) const;
+  bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
+  bool isNegInlineImmediate(const SDNode *N) const {
+    return isInlineImmediate(N, true);
+  }
+
   bool isVGPRImm(const SDNode *N) const;
   bool isUniformLoad(const SDNode *N) const;
   bool isUniformBr(const SDNode *N) const;
@@ -437,14 +477,25 @@
   return CurDAG->isKnownNeverNaN(N);
 }
 
-bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
+                                           bool Negated) const {
+  // TODO: Handle undef
+
   const SIInstrInfo *TII = Subtarget->getInstrInfo();
+  if (Negated) {
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+      return TII->isInlineConstant(-C->getAPIntValue());
 
-  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
-    return TII->isInlineConstant(C->getAPIntValue());
+    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
 
-  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
-    return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+  } else {
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+      return TII->isInlineConstant(C->getAPIntValue());
+
+    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+  }
 
   return false;
 }
@@ -563,20 +614,6 @@
   llvm_unreachable("invalid vector size");
 }
 
-static bool getConstantValue(SDValue N, uint32_t &Out) {
-  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
-    Out = C->getAPIntValue().getZExtValue();
-    return true;
-  }
-
-  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
-    Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
-    return true;
-  }
-
-  return false;
-}
-
 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   EVT VT = N->getValueType(0);
   unsigned NumVectorElts = VT.getVectorNumElements();
@@ -685,12 +722,8 @@
   unsigned NumVectorElts = VT.getVectorNumElements();
   if (VT.getScalarSizeInBits() == 16) {
     if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
-      uint32_t LHSVal, RHSVal;
-      if (getConstantValue(N->getOperand(0), LHSVal) &&
-          getConstantValue(N->getOperand(1), RHSVal)) {
-        uint32_t K = LHSVal | (RHSVal << 16);
-        CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
-                             CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+      if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
+        ReplaceNode(N, Packed);
         return;
       }
     }
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -606,6 +606,25 @@
   return N->getZExtValue() < 32;
 }]>;
 
+
+def getNegV2I16Imm : SDNodeXForm<build_vector, [{
+  return SDValue(packNegConstantV2I16(N, *CurDAG), 0);
+}]>;
+
+
+// TODO: Handle undef as 0
+def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
+  assert(N->getNumOperands() == 2);
+  assert(N->getOperand(0).getValueType().getSizeInBits() == 16);
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  if (Src0 == Src1)
+    return isNegInlineImmediate(Src0.getNode());
+
+  return (isNullConstant(Src0) && isNegInlineImmediate(Src1.getNode())) ||
+         (isNullConstant(Src1) && isNegInlineImmediate(Src0.getNode()));
+}], getNegV2I16Imm>;
+
 //===----------------------------------------------------------------------===//
 // Custom Operands
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/VOP3PInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3PInstructions.td
+++ lib/Target/AMDGPU/VOP3PInstructions.td
@@ -69,6 +69,17 @@
 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// The constant will be emitted as a mov, and folded later.
+// TODO: We could directly encode the immediate now
+def : GCNPat<
+  (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
+  (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
+>;
+
+
 multiclass MadFmaMixPats
Index: test/CodeGen/AMDGPU/chain-hi-to-lo.ll
===================================================================
--- test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ test/CodeGen/AMDGPU/chain-hi-to-lo.ll
 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
@@ -222,7 +222,7 @@
 ; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
 ; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
 ; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
@@ -244,7 +244,7 @@
 ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
 ; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
@@ -264,7 +264,7 @@
 ; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX900-NEXT: v_bfi_b32 v0, v1, v2, v0
 ; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
Index: test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
===================================================================
--- test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1233,7 +1233,6 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_movk_i32 s4, 0xffe0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
@@ -1243,7 +1242,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, s4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1296,7 +1295,6 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
@@ -1306,7 +1304,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1362,7 +1360,6 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffe0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
@@ -1372,7 +1369,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1440,7 +1437,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1502,7 +1499,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1567,7 +1564,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -16
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1634,7 +1631,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1701,7 +1698,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1768,7 +1765,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1835,7 +1832,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0]
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -6,7 +6,7 @@
 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
 
 ; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 ; CIVI: s_sub_i32
@@ -30,7 +30,7 @@
 ; GFX9: global_load_dword [[VAL:v[0-9]+]]
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
 
 ; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
@@ -70,7 +70,7 @@
 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
 define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
@@ -88,7 +88,7 @@
 ; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
 define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
@@ -109,8 +109,8 @@
 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]]
-; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
-; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
+; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0]
+; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
 define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
@@ -133,11 +133,11 @@
 
 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
-; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0]
 
 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
-; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
 define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
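
Reviewer note, not part of the patch: the standalone C++ sketch below mirrors the lane-packing arithmetic in packConstantV2I16/packNegConstantV2I16 above (the helper name packV2I16 is made up for illustration). It shows why negating the lanes can turn a v2i16 constant that otherwise needs a materialized 32-bit literal (for example the splat of -32, which packs to 0xffe0ffe0) into a value that fits a VOP3P inline operand (32), which is what the new V_PK_SUB_U16 pattern and the updated test checks rely on.

  // Standalone sketch (assumed helper, not the patch's API): mimic the lane
  // packing done by packConstantV2I16, with and without negation.
  #include <cstdint>
  #include <cstdio>

  static uint32_t packV2I16(int16_t Lo, int16_t Hi, bool Negate = false) {
    // Same math as the patch: low lane in bits [15:0], high lane in [31:16].
    uint32_t L = static_cast<uint16_t>(Negate ? -Lo : Lo);
    uint32_t H = static_cast<uint16_t>(Negate ? -Hi : Hi);
    return (L & 0xffff) | (H << 16);
  }

  int main() {
    // add x, <-32, -32>: packs to 0xffe0ffe0, which is not a VOP3P inline
    // immediate, so the old checks materialize it (s_movk_i32 s4, 0xffe0)
    // and broadcast it through op_sel_hi.
    std::printf("add constant: 0x%08x\n", packV2I16(-32, -32));
    // Rewritten as sub x, <32, 32>: the negated constant packs to 0x00200020,
    // i.e. the inline immediate 32 broadcast via op_sel_hi:[1,0].
    std::printf("sub constant: 0x%08x\n", packV2I16(-32, -32, true));
    return 0;
  }

In other words, the constant itself does not get cheaper; only its sign changes, and the sub form happens to land in the inline-immediate range that v_pk_sub_u16 can encode directly.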