Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -78,6 +78,7 @@ SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -506,6 +506,7 @@ setTargetDAGCombine(ISD::FABS); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::AssertSext); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); } //===----------------------------------------------------------------------===// @@ -2771,8 +2772,16 @@ static SDValue simplifyI24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - SDValue LHS = Node24->getOperand(0); - SDValue RHS = Node24->getOperand(1); + bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; + + SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); + SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); + unsigned NewOpcode = Node24->getOpcode(); + if (IsIntrin) { + unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue(); + NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ? 
+ AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + } APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); @@ -2782,7 +2791,7 @@ SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); if (DemandedLHS || DemandedRHS) - return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(), + return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), DemandedLHS ? DemandedLHS : LHS, DemandedRHS ? DemandedRHS : RHS); @@ -3020,6 +3029,19 @@ return SDValue(); } + +SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mul_i24: + case Intrinsic::amdgcn_mul_u24: + return simplifyI24(N, DCI); + default: + return SDValue(); + } +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. 
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -4108,6 +4130,8 @@ case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicWOChainCombine(N, DCI); } return SDValue(); } Index: test/CodeGen/AMDGPU/mad_int24.ll =================================================================== --- test/CodeGen/AMDGPU/mad_int24.ll +++ test/CodeGen/AMDGPU/mad_int24.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC @@ -9,8 +9,8 @@ ; Make sure we aren't masking the inputs. 
; CM-NOT: AND ; CM: MULADD_INT24 -; SI-NOT: and -; SI: v_mad_i32_i24 +; GCN-NOT: and +; GCN: v_mad_i32_i24 define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 @@ -22,3 +22,100 @@ store i32 %3, i32 addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}mad24_known_bits_destroyed: +; GCN: s_waitcnt +; GCN-NEXT: v_mad_i32_i24 +; GCN-NEXT: v_mul_i32_i24 +; GCN-NEXT: s_setpc_b64 +define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { + + %shl.0 = shl i32 %a, 8 + %sra.0 = ashr i32 %shl.0, 8 + %shl.1 = shl i32 %b, 8 + %sra.1 = ashr i32 %shl.1, 8 + + %mul0 = mul nsw i32 %sra.0, %sra.1 + %add0 = add nsw i32 %mul0, %c + + %shl.2 = shl i32 %add0, 8 + %sra.2 = ashr i32 %shl.2, 8 + + %shl.3 = shl i32 %sra.0, 8 + %sra.3 = ashr i32 %shl.3, 8 + + %mul1 = mul nsw i32 %sra.2, %sra.3 + ret i32 %mul1 +} + +; GCN-LABEL: {{^}}mad24_intrin_known_bits_destroyed: +; GCN: s_waitcnt +; GCN-NEXT: v_mad_i32_i24 +; GCN-NEXT: v_mul_i32_i24 +; GCN-NEXT: s_setpc_b64 +define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { + %shl.0 = shl i32 %a, 8 + %sra.0 = ashr i32 %shl.0, 8 + %shl.1 = shl i32 %b, 8 + %sra.1 = ashr i32 %shl.1, 8 + + %mul0 = call i32 @llvm.amdgcn.mul.i24(i32 %sra.0, i32 %sra.1) + %add0 = add nsw i32 %mul0, %c + + %shl.2 = shl i32 %add0, 8 + %sra.2 = ashr i32 %shl.2, 8 + + %shl.3 = shl i32 %sra.0, 8 + %sra.3 = ashr i32 %shl.3, 8 + + %mul1 = mul nsw i32 %sra.2, %sra.3 + ret i32 %mul1 +} + +; Make sure no unnecessary BFEs are emitted in the loop. 
+; GCN-LABEL: {{^}}mad24_destroyed_knownbits_2: +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, i32 addrspace(1)* %arg3) { +bb: + br label %bb6 + +bb5: ; preds = %bb6 + ret void + +bb6: ; preds = %bb6, %bb + %tmp = phi i32 [ %tmp27, %bb6 ], [ 0, %bb ] + %tmp7 = phi i32 [ %arg2, %bb6 ], [ 1, %bb ] + %tmp8 = phi i32 [ %tmp26, %bb6 ], [ %arg, %bb ] + %tmp9 = shl i32 %tmp7, 8 + %tmp10 = ashr exact i32 %tmp9, 8 + %tmp11 = shl i32 %tmp8, 8 + %tmp12 = ashr exact i32 %tmp11, 8 + %tmp13 = mul nsw i32 %tmp12, %tmp10 + %tmp14 = add nsw i32 %tmp13, %tmp7 + %tmp15 = shl i32 %tmp14, 8 + %tmp16 = ashr exact i32 %tmp15, 8 + %tmp17 = mul nsw i32 %tmp16, %tmp10 + %tmp18 = add nsw i32 %tmp17, %tmp14 + %tmp19 = shl i32 %tmp18, 8 + %tmp20 = ashr exact i32 %tmp19, 8 + %tmp21 = mul nsw i32 %tmp20, %tmp16 + %tmp22 = add nsw i32 %tmp21, %tmp18 + %tmp23 = shl i32 %tmp22, 8 + %tmp24 = ashr exact i32 %tmp23, 8 + %tmp25 = mul nsw i32 %tmp24, %tmp20 + %tmp26 = add nsw i32 %tmp25, %tmp22 + store i32 %tmp26, i32 addrspace(1)* %arg3 + %tmp27 = add nuw i32 %tmp, 1 + %tmp28 = icmp eq i32 %tmp27, %arg1 + br i1 %tmp28, label %bb5, label %bb6 +} + +declare i32 @llvm.amdgcn.mul.i24(i32, i32)