Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -69,6 +69,7 @@
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  bool performMul24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
                              SDValue RHS, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2438,6 +2438,45 @@
   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
 }
 
+// Strip an 'and x, mask' whose mask keeps the low 24 bits intact; mul24
+// nodes only demand the low 24 bits of each operand, so such an AND is a
+// no-op for this user.
+static SDValue reduce24BitOperand(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getOpcode() == ISD::AND)
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+      if (C->getAPIntValue().countTrailingOnes() >= 24)
+        return Op.getOperand(0);
+  return Op;
+}
+
+bool AMDGPUTargetLowering::performMul24Combine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  bool Simplified = simplifyI24(N0, DCI);
+  Simplified |= simplifyI24(N1, DCI);
+
+  if (Simplified)
+    return true;
+
+  // simplifyI24 only works if N0 or N1 has one use, so we need to handle
+  // the multiple use case here.
+  SDValue ReducedN0 = reduce24BitOperand(N0, DAG);
+  SDValue ReducedN1 = reduce24BitOperand(N1, DAG);
+
+  if (ReducedN0 == N0 && ReducedN1 == N1)
+    return false;
+
+  // UpdateNodeOperands may CSE to a pre-existing node; in that case N is
+  // left untouched and all of its uses must be redirected instead.
+  SDNode *Updated = DAG.UpdateNodeOperands(N, ReducedN0, ReducedN1);
+  if (Updated != N)
+    DCI.CombineTo(N, SDValue(Updated, 0));
+  return true;
+}
+
 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -2446,7 +2485,7 @@
   SDValue N1 = N->getOperand(1);
 
   // Simplify demanded bits before splitting into multiple users.
-  if (simplifyI24(N0, DCI) || simplifyI24(N1, DCI))
+  if (performMul24Combine(N, DCI))
     return SDValue();
 
   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
@@ -2629,10 +2668,7 @@
   case AMDGPUISD::MUL_U24:
   case AMDGPUISD::MULHI_I24:
   case AMDGPUISD::MULHI_U24: {
-    SDValue N0 = N->getOperand(0);
-    SDValue N1 = N->getOperand(1);
-    simplifyI24(N0, DCI);
-    simplifyI24(N1, DCI);
+    performMul24Combine(N, DCI);
     return SDValue();
   }
   case AMDGPUISD::MUL_LOHI_I24:
Index: test/CodeGen/AMDGPU/mad_uint24.ll
===================================================================
--- test/CodeGen/AMDGPU/mad_uint24.ll
+++ test/CodeGen/AMDGPU/mad_uint24.ll
@@ -74,3 +74,63 @@
   store i32 %4, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}extra_and:
+; SI-NOT: v_and
+; SI: v_mad_u32_u24
+; SI: v_mad_u32_u24
+define amdgpu_kernel void @extra_and(i32 addrspace(1)* %arg, i32 %arg2, i32 %arg3) {
+bb:
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ]
+  %tmp5 = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ]
+  %tmp6 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ]
+  %tmp7 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ]
+  %tmp8 = and i32 %tmp7, 16777215
+  %tmp9 = and i32 %tmp6, 16777215
+  %tmp10 = and i32 %tmp5, 16777215
+  %tmp11 = and i32 %tmp, 16777215
+  %tmp12 = mul i32 %tmp8, %tmp11
+  %tmp13 = add i32 %arg2, %tmp12
+  %tmp14 = mul i32 %tmp9, %tmp11
+  %tmp15 = add i32 %arg3, %tmp14
+  %tmp16 = add nuw nsw i32 %tmp13, %tmp15
+  %tmp17 = icmp eq i32 %tmp16, 8
+  br i1 %tmp17, label %bb18, label %bb4
+
+bb18:                                             ; preds = %bb4
+  store i32 %tmp16, i32 addrspace(1)* %arg
+  ret void
+}
+
+; FUNC-LABEL: {{^}}dont_remove_shift:
+; SI: v_lshr
+; SI: v_mad_u32_u24
+; SI: v_mad_u32_u24
+define amdgpu_kernel void @dont_remove_shift(i32 addrspace(1)* %arg, i32 %arg2, i32 %arg3) {
+bb:
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ]
+  %tmp5 = phi i32 [ 0, %bb ], [ %tmp13, %bb4 ]
+  %tmp6 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ]
+  %tmp7 = phi i32 [ 0, %bb ], [ %tmp15, %bb4 ]
+  %tmp8 = lshr i32 %tmp7, 8
+  %tmp9 = lshr i32 %tmp6, 8
+  %tmp10 = lshr i32 %tmp5, 8
+  %tmp11 = lshr i32 %tmp, 8
+  %tmp12 = mul i32 %tmp8, %tmp11
+  %tmp13 = add i32 %arg2, %tmp12
+  %tmp14 = mul i32 %tmp9, %tmp11
+  %tmp15 = add i32 %arg3, %tmp14
+  %tmp16 = add nuw nsw i32 %tmp13, %tmp15
+  %tmp17 = icmp eq i32 %tmp16, 8
+  br i1 %tmp17, label %bb18, label %bb4
+
+bb18:                                             ; preds = %bb4
+  store i32 %tmp16, i32 addrspace(1)* %arg
+  ret void
+}
Index: test/CodeGen/AMDGPU/mul_uint24.ll
===================================================================
--- test/CodeGen/AMDGPU/mul_uint24.ll
+++ test/CodeGen/AMDGPU/mul_uint24.ll
@@ -119,12 +119,10 @@
   ret void
 }
 
-; FIXME: Should be able to eliminate the and
 ; FUNC-LABEL: {{^}}test_umul24_i64_square:
 ; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_and_b32 [[TRUNC:s[0-9]+]], [[A]], 0xffffff{{$}}
-; SI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]]
-; SI-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]]
+; SI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
+; SI-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
 define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
 entry:
   %tmp0 = shl i64 %a, 40