Index: llvm/include/llvm/Support/KnownBits.h =================================================================== --- llvm/include/llvm/Support/KnownBits.h +++ llvm/include/llvm/Support/KnownBits.h @@ -97,6 +97,11 @@ /// Returns true if this value is known to be non-negative. bool isNonNegative() const { return Zero.isSignBitSet(); } + /// Returns true if this value is known to be positive. + bool isStrictlyPositive() const { + return Zero.isSignBitSet() && !One.isNullValue(); + } + /// Make this value negative. void makeNegative() { One.setSignBit(); Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4439,7 +4439,6 @@ LHSKnown = LHSKnown.trunc(24); RHSKnown = RHSKnown.trunc(24); - bool Negative = false; if (Opc == AMDGPUISD::MUL_I24) { unsigned LHSValBits = 24 - LHSKnown.countMinSignBits(); unsigned RHSValBits = 24 - RHSKnown.countMinSignBits(); @@ -4447,16 +4446,16 @@ if (MaxValBits >= 32) break; bool LHSNegative = LHSKnown.isNegative(); - bool LHSPositive = LHSKnown.isNonNegative(); + bool LHSNonNegative = LHSKnown.isNonNegative(); + bool LHSPositive = LHSKnown.isStrictlyPositive(); bool RHSNegative = RHSKnown.isNegative(); - bool RHSPositive = RHSKnown.isNonNegative(); - if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive)) - break; - Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative); - if (Negative) - Known.One.setHighBits(32 - MaxValBits); - else + bool RHSNonNegative = RHSKnown.isNonNegative(); + bool RHSPositive = RHSKnown.isStrictlyPositive(); + + if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) Known.Zero.setHighBits(32 - MaxValBits); + else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) + Known.One.setHighBits(32 - MaxValBits); } else { unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros(); unsigned
RHSValBits = 24 - RHSKnown.countMinLeadingZeros(); Index: llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -0,0 +1,176 @@ +; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -O2 -amdgpu-function-calls=0 < %s | FileCheck --check-prefix=GCN %s +; GCN-NOT: v_add3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -8 + +;@Gridwise = linkonce_odr local_unnamed_addr addrspace(3) global [1024 x float] undef, align 16 + +; Function Attrs: alwaysinline convergent norecurse nounwind +define weak_odr amdgpu_kernel void @test_kernel(float* noalias %p_in_global, float* noalias %p_wei_global, float* noalias %p_out_global) #4 { +entry: + %0 = addrspacecast float* %p_in_global to float addrspace(1)* + %1 = addrspacecast float* %p_wei_global to float addrspace(1)* + %2 = addrspacecast float addrspace(1)* %1 to float* + %3 = addrspacecast float* %p_out_global to float addrspace(1)* + %4 = addrspacecast float addrspace(1)* %3 to float* + %5 = tail call i32 @llvm.amdgcn.workitem.id.x() #28, !range !4 + %tid_y = lshr i32 %5, 4 + %tid_x = and i32 %5, 15 + + %y_div_5 = sdiv i32 %tid_y, 5 + %6 = mul nsw i32 %y_div_5, -5 + %y_mod_5 = add nsw i32 %6, %tid_y + %v1 = add nsw i32 %tid_x, %y_mod_5 + + %7 = icmp sgt i32 %v1, -2 + %spec.select.i51 = select i1 %7, i32 %v1, i32 -2 + %8 = sext i32 %spec.select.i51 to i64 + + %mul.i.i.i.i.i.i = shl nuw nsw i32 %tid_y, 7 + %mul.i36.i.i.i.i.i = shl nuw nsw i32 %tid_x, 2 + %add.i38.i.i.i.i.i = or i32 %mul.i.i.i.i.i.i, %mul.i36.i.i.i.i.i + %div400 = udiv i32 %5, 400 + + %arrayidx.i.i216.7i.i53134 = getelementptr inbounds float, float addrspace(1)* %0, i64 %8 + store float 0.000, float addrspace(1)* %arrayidx.i.i216.7i.i53134, align 4 + %9 = load float, float addrspace(1)* %arrayidx.i.i216.7i.i53134, align 4, !tbaa !5 + %add.7i.i.i.14150.i54 = add nsw i32 %spec.select.i51, 25088 +
%idxprom.i.i215.7i.14151.i55 = sext i32 %add.7i.i.i.14150.i54 to i64 + %arrayidx.i.i216.7i.14152.i56135 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.14151.i55 + %10 = load float, float addrspace(1)* %arrayidx.i.i216.7i.14152.i56135, align 4, !tbaa !5 + %add.7i.i.i.2.i57 = add nsw i32 %spec.select.i51, 50176 + %idxprom.i.i215.7i.2.i58 = sext i32 %add.7i.i.i.2.i57 to i64 + %arrayidx.i.i216.7i.2.i59136 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.2.i58 + %11 = load float, float addrspace(1)* %arrayidx.i.i216.7i.2.i59136, align 4, !tbaa !5 + %add.7i.i.i.3.i60 = add nsw i32 %spec.select.i51, 75264 + %idxprom.i.i215.7i.3.i61 = sext i32 %add.7i.i.i.3.i60 to i64 + %arrayidx.i.i216.7i.3.i62137 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.3.i61 + %12 = load float, float addrspace(1)* %arrayidx.i.i216.7i.3.i62137, align 4, !tbaa !5 + %add.7i.i.i.1.i63 = add nsw i32 %spec.select.i51, 100352 + %idxprom.i.i215.7i.1.i64 = sext i32 %add.7i.i.i.1.i63 to i64 + %arrayidx.i.i216.7i.1.i65138 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.i64 + %13 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.i65138, align 4, !tbaa !5 + %add.7i.i.i.1.1.i66 = add nsw i32 %spec.select.i51, 125440 + %idxprom.i.i215.7i.1.1.i67 = sext i32 %add.7i.i.i.1.1.i66 to i64 + %arrayidx.i.i216.7i.1.1.i68139 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.1.i67 + %14 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.1.i68139, align 4, !tbaa !5 + %add.7i.i.i.1.2.i69 = add nsw i32 %spec.select.i51, 150528 + %idxprom.i.i215.7i.1.2.i70 = sext i32 %add.7i.i.i.1.2.i69 to i64 + %arrayidx.i.i216.7i.1.2.i71140 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.2.i70 + %15 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.2.i71140, align 4, !tbaa !5 + %add.7i.i.i.1.3.i72 = add nsw i32 %spec.select.i51, 175616 + 
%idxprom.i.i215.7i.1.3.i73 = sext i32 %add.7i.i.i.1.3.i72 to i64 + %arrayidx.i.i216.7i.1.3.i74141 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.3.i73 + %16 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.3.i74141, align 4, !tbaa !5 + %vector_data..12i.0.vec.insert.i35 = insertelement <4 x float> undef, float %9, i32 0 + %vector_data..12i.4.vec.insert.i36 = insertelement <4 x float> %vector_data..12i.0.vec.insert.i35, float %10, i32 1 + %vector_data..12i.8.vec.insert.i37 = insertelement <4 x float> %vector_data..12i.4.vec.insert.i36, float %11, i32 2 + %vector_data..12i.12.vec.insert.i38 = insertelement <4 x float> %vector_data..12i.8.vec.insert.i37, float %12, i32 3 + ;%Gridwise = addrspacecast float addrspace(1)* %3 to float* <1024 x float>, <1024 x float] + ;%arrayidx.i.i46..12i.i40 = getelementptr inbounds [1024 x float], [1024 x float] addrspace(1)* @Gridwise, i32 0, i32 %add.i38.i.i.i.i.i + %17 = bitcast float addrspace(1)* %3 to <4 x float> addrspace(1)* + store <4 x float> %vector_data..12i.12.vec.insert.i38, <4 x float> addrspace(1)* %17, align 16, !tbaa !9 + %vector_data..12i.0.vec.insert4.i41 = insertelement <4 x float> undef, float %13, i32 0 + %vector_data..12i.4.vec.insert9.i42 = insertelement <4 x float> %vector_data..12i.0.vec.insert4.i41, float %14, i32 1 + %vector_data..12i.8.vec.insert11.i43 = insertelement <4 x float> %vector_data..12i.4.vec.insert9.i42, float %15, i32 2 + %vector_data..12i.12.vec.insert13.i44 = insertelement <4 x float> %vector_data..12i.8.vec.insert11.i43, float %16, i32 3 + %add.7i.7i.1.i45 = or i32 %add.i38.i.i.i.i.i, 64 + ;%arrayidx.i.i46..12i.1.i47 = getelementptr inbounds [1024 x float], [1024 x float] addrspace(1)* @Gridwise, i32 0, i32 %add.7i.7i.1.i45 + %18 = bitcast float addrspace(1)* %3 to <4 x float> addrspace(1)* + store <4 x float> %vector_data..12i.12.vec.insert13.i44, <4 x float> addrspace(1)* %18, align 16, !tbaa !9 + %add.i.i25.7i.7i.i.i = add nsw i32 %y_mod_5, 4 + 
%cmp.i.i.i12..12i = icmp slt i32 %add.i.i25.7i.7i.i.i, 5 + %sub.7i.7i.i = add i32 %y_mod_5, 2147483647 + %agg.tmp12.7i.i.i.i.sroa.0.sroa.5.0.copyload = select i1 %cmp.i.i.i12..12i, i32 %add.i.i25.7i.7i.i.i, i32 %sub.7i.7i.i + %not.cmp.i.i.i12..12i = xor i1 %cmp.i.i.i12..12i, true + %inc.i11.7i.7i = zext i1 %not.cmp.i.i.i12..12i to i32 + %19 = add nsw i32 %y_div_5, %inc.i11.7i.7i + %cmp.i16.7i.7i = icmp slt i32 %19, 5 + %sub.i21.7i.7i = add nsw i32 %19, -5 + %agg.tmp12.7i.i.i.i.sroa.0.sroa.4.0.copyload = select i1 %cmp.i16.7i.7i, i32 %19, i32 %sub.i21.7i.7i + %not.cmp.i16.7i.7i = xor i1 %cmp.i16.7i.7i, true + %inc..12i = zext i1 %not.cmp.i16.7i.7i to i32 + %agg.tmp12.7i.i.i.i.sroa.0.sroa.0.0.copyload = add nuw nsw i32 %div400, %inc..12i + %mul.7i.7i = mul nuw nsw i32 %agg.tmp12.7i.i.i.i.sroa.0.sroa.0.0.copyload, 784 + %mul.i34..12i.i = mul nsw i32 %agg.tmp12.7i.i.i.i.sroa.0.sroa.4.0.copyload, 56 + %mul.i19..12i.i = shl i32 %agg.tmp12.7i.i.i.i.sroa.0.sroa.5.0.copyload, 1 + %add.i36..12i.i = add i32 %mul.i19..12i.i, %tid_x + %add.i21..12i.i = add i32 %add.i36..12i.i, %mul.i34..12i.i + %sub.7i.i.i.i = add i32 %add.i21..12i.i, %mul.7i.7i + %20 = icmp sgt i32 %sub.7i.i.i.i, -4 + %spec.select.i = select i1 %20, i32 %sub.7i.i.i.i, i32 -4 + %idxprom.i.i215.7i.i = sext i32 %spec.select.i to i64 + %arrayidx.i.i216.7i.i142 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.i + %21 = load float, float addrspace(1)* %arrayidx.i.i216.7i.i142, align 4, !tbaa !5 + %add.7i.i.i.14150.i = add nsw i32 %spec.select.i, 25088 + %idxprom.i.i215.7i.14151.i = sext i32 %add.7i.i.i.14150.i to i64 + %arrayidx.i.i216.7i.14152.i143 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.14151.i + %22 = load float, float addrspace(1)* %arrayidx.i.i216.7i.14152.i143, align 4, !tbaa !5 + %add.7i.i.i.2.i = add nsw i32 %spec.select.i, 50176 + %idxprom.i.i215.7i.2.i = sext i32 %add.7i.i.i.2.i to i64 + %arrayidx.i.i216.7i.2.i144 = getelementptr inbounds 
float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.2.i + %23 = load float, float addrspace(1)* %arrayidx.i.i216.7i.2.i144, align 4, !tbaa !5 + %add.7i.i.i.3.i = add nsw i32 %spec.select.i, 75264 + %idxprom.i.i215.7i.3.i = sext i32 %add.7i.i.i.3.i to i64 + %arrayidx.i.i216.7i.3.i145 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.3.i + %24 = load float, float addrspace(1)* %arrayidx.i.i216.7i.3.i145, align 4, !tbaa !5 + %add.7i.i.i.1.i = add nsw i32 %spec.select.i, 100352 + %idxprom.i.i215.7i.1.i = sext i32 %add.7i.i.i.1.i to i64 + %arrayidx.i.i216.7i.1.i146 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.i + %25 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.i146, align 4, !tbaa !5 + %add.7i.i.i.1.1.i = add nsw i32 %spec.select.i, 125440 + %idxprom.i.i215.7i.1.1.i = sext i32 %add.7i.i.i.1.1.i to i64 + %arrayidx.i.i216.7i.1.1.i147 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.1.i + %26 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.1.i147, align 4, !tbaa !5 + %add.7i.i.i.1.2.i = add nsw i32 %spec.select.i, 150528 + %idxprom.i.i215.7i.1.2.i = sext i32 %add.7i.i.i.1.2.i to i64 + %arrayidx.i.i216.7i.1.2.i148 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.2.i + %27 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.2.i148, align 4, !tbaa !5 + %add.7i.i.i.1.3.i = add nsw i32 %spec.select.i, 175616 + %idxprom.i.i215.7i.1.3.i = sext i32 %add.7i.i.i.1.3.i to i64 + %arrayidx.i.i216.7i.1.3.i149 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idxprom.i.i215.7i.1.3.i + %28 = load float, float addrspace(1)* %arrayidx.i.i216.7i.1.3.i149, align 4, !tbaa !5 + + %vector_data..12i.0.vec.insert.i = insertelement <4 x float> undef, float %21, i32 0 + %vector_data..12i.4.vec.insert.i = insertelement <4 x float> %vector_data..12i.0.vec.insert.i, float %22, i32 1 + %vector_data..12i.8.vec.insert.i = insertelement <4 x 
float> %vector_data..12i.4.vec.insert.i, float %23, i32 2 + %vector_data..12i.12.vec.insert.i = insertelement <4 x float> %vector_data..12i.8.vec.insert.i, float %24, i32 3 + + ;%arrayidx.i.i46..12i.i = getelementptr inbounds float, float addrspace(3)* getelementptr inbounds ([1024 x float], [1024 x float] addrspace(1)* @Gridwise, i32 0, i32 512), i32 %add.i38.i.i.i.i.i + %29 = bitcast float addrspace(1)* %3 to <4 x float> addrspace(1)* + store <4 x float> %vector_data..12i.12.vec.insert.i, <4 x float> addrspace(1)* %29, align 16, !tbaa !9 + + %vector_data..12i.0.vec.insert4.i = insertelement <4 x float> undef, float %25, i32 0 + %vector_data..12i.4.vec.insert9.i = insertelement <4 x float> %vector_data..12i.0.vec.insert4.i, float %26, i32 1 + %vector_data..12i.8.vec.insert11.i = insertelement <4 x float> %vector_data..12i.4.vec.insert9.i, float %27, i32 2 + %vector_data..12i.12.vec.insert13.i = insertelement <4 x float> %vector_data..12i.8.vec.insert11.i, float %28, i32 3 + + ;%arrayidx.i.i46..12i.1.i = getelementptr inbounds float, float addrspace(3)* getelementptr inbounds ([1024 x float], [1024 x float] addrspace(1)* @Gridwise, i32 0, i32 512), i32 %add.7i.7i.1.i45 + %out = bitcast float addrspace(1)* %3 to <4 x float> addrspace(1)* + store <4 x float> %vector_data..12i.12.vec.insert.i38, <4 x float> addrspace(1)* %out, align 16, !tbaa !9 + ret void +} + +; Function Attrs: convergent nounwind +declare void @llvm.amdgcn.s.barrier() #21 + +; Function Attrs: nounwind readnone speculatable +declare i32 @llvm.amdgcn.workitem.id.x() #20 + +; Function Attrs: nounwind readnone speculatable +declare i32 @llvm.amdgcn.workgroup.id.x() #20 + +attributes #4 = { alwaysinline convergent norecurse nounwind "HC" "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="2" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx8-insts,+gfx9-insts,+s-memrealtime" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1} +!opencl.ocl.version = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 2, i32 0} +!3 = !{!"HCC clang version 10.0.0 (/data/jenkins_workspace/compute-rocm-rel-2.9/external/hcc-tot/clang fa40706d8ba0b8b958d42f579120eb9b89babc00) (/data/jenkins_workspace/compute-rocm-rel-2.9/external/hcc-tot/compiler b7f876231af7fdaf52e419088b8ba9e0c3a61845) (based on HCC 2.9.19392-75835c3-fa40706-b7f8762 )"} +!4 = !{i32 0, i32 1024} +!5 = !{!6, !6, i64 0} +!6 = !{!"float", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = !{!7, !7, i64 0}