Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1175,7 +1175,7 @@
 
 // This is a shortcut for integer division because we have fast i32<->f32
 // conversions, and fast f32 reciprocal instructions. The fractional part of a
-// float is enough to accurately represent up to a 24-bit integer.
+// float is enough to accurately represent up to a 24-bit signed integer.
 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                             bool Sign) const {
   SDLoc DL(Op);
@@ -1185,10 +1185,22 @@
   MVT IntVT = MVT::i32;
   MVT FltVT = MVT::f32;
 
-  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
-  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
+  if (LHSSignBits < 9)
+    return SDValue();
+
+  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
+  if (RHSSignBits < 9)
+    return SDValue();
 
   unsigned BitSize = VT.getSizeInBits();
+  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+  unsigned DivBits = BitSize - SignBits;
+  if (Sign)
+    ++DivBits;
+
+  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
 
   SDValue jq = DAG.getConstant(1, DL, IntVT);
 
@@ -1252,6 +1264,18 @@
   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
 
+  // Truncate to number of bits this divide really is.
+  if (Sign) {
+    SDValue InRegSize
+      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
+    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
+    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
+  } else {
+    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
+    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
+    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
+  }
+
   return DAG.getMergeValues({ Div, Rem }, DL);
 }
 
@@ -1344,19 +1368,14 @@
     return DAG.getMergeValues(Results, DL);
   }
 
-  SDValue Num = Op.getOperand(0);
-  SDValue Den = Op.getOperand(1);
-
   if (VT == MVT::i32) {
-    if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
-        DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
-      // TODO: We technically could do this for i64, but shouldn't that just be
-      // handled by something generally reducing 64-bit division on 32-bit
-      // values to 32-bit?
-      return LowerDIVREM24(Op, DAG, false);
-    }
+    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
+      return Res;
   }
 
+  SDValue Num = Op.getOperand(0);
+  SDValue Den = Op.getOperand(1);
+
   // RCP =  URECIP(Den) = 2^32 / Den + e
   // e is rounding error.
   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
@@ -1464,11 +1483,11 @@
   SDValue Zero = DAG.getConstant(0, DL, VT);
   SDValue NegOne = DAG.getConstant(-1, DL, VT);
 
-  if (VT == MVT::i32 &&
-      DAG.ComputeNumSignBits(LHS) > 8 &&
-      DAG.ComputeNumSignBits(RHS) > 8) {
-    return LowerDIVREM24(Op, DAG, true);
+  if (VT == MVT::i32) {
+    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
+      return Res;
   }
+
   if (VT == MVT::i64 &&
       DAG.ComputeNumSignBits(LHS) > 32 &&
       DAG.ComputeNumSignBits(RHS) > 32) {
Index: test/CodeGen/AMDGPU/sdiv.ll
===================================================================
--- test/CodeGen/AMDGPU/sdiv.ll
+++ test/CodeGen/AMDGPU/sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
@@ -82,6 +82,60 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_sdiv_i8:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %num = load i8, i8 addrspace(1) * %in
+  %den = load i8, i8 addrspace(1) * %den_ptr
+  %result = sdiv i8 %num, %den
+  %result.ext = sext i8 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i23:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+  %num = load i23, i23 addrspace(1) * %in
+  %den = load i23, i23 addrspace(1) * %den_ptr
+  %result = sdiv i23 %num, %den
+  %result.ext = sext i23 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i24:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+  %num = load i24, i24 addrspace(1) * %in
+  %den = load i24, i24 addrspace(1) * %den_ptr
+  %result = sdiv i24 %num, %den
+  %result.ext = sext i24 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i25:
+; SI-NOT: v_rcp_f32
+define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+  %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
+  %num = load i25, i25 addrspace(1) * %in
+  %den = load i25, i25 addrspace(1) * %den_ptr
+  %result = sdiv i25 %num, %den
+  %result.ext = sext i25 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
+
 ; Tests for 64-bit divide bypass.
 ; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %result = sdiv i64 %a, %b
Index: test/CodeGen/AMDGPU/sdivrem24.ll
===================================================================
--- test/CodeGen/AMDGPU/sdivrem24.ll
+++ test/CodeGen/AMDGPU/sdivrem24.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
@@ -181,13 +181,13 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}srem25_i32:
+; FUNC-LABEL: {{^}}no_srem25_i32:
 ; SI-NOT: v_cvt_f32_i32
 ; SI-NOT: v_rcp_f32
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -200,40 +200,138 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_no_srem24_i32_1:
+; FUNC-LABEL: {{^}}no_sdiv25_i24_i25_i32:
 ; SI-NOT: v_cvt_f32_i32
 ; SI-NOT: v_rcp_f32
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
-  %den.i24.0 = shl i32 %den, 7
+  %den.i25.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 8
-  %den.i24 = ashr i32 %den.i24.0, 7
-  %result = srem i32 %num.i24, %den.i24
+  %den.i25 = ashr i32 %den.i25.0, 7
+  %result = sdiv i32 %num.i24, %den.i25
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_no_srem24_i32_2:
+; FUNC-LABEL: {{^}}no_sdiv25_i25_i24_i32:
 ; SI-NOT: v_cvt_f32_i32
 ; SI-NOT: v_rcp_f32
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
-  %num.i24.0 = shl i32 %num, 7
+  %num.i25.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
-  %num.i24 = ashr i32 %num.i24.0, 7
+  %num.i25 = ashr i32 %num.i25.0, 7
   %den.i24 = ashr i32 %den.i24.0, 8
-  %result = srem i32 %num.i24, %den.i24
+  %result = sdiv i32 %num.i25, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i24_i25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i25.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i25 = ashr i32 %den.i25.0, 7
+  %result = srem i32 %num.i24, %den.i25
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i25_i24_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i25.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i25 = ashr i32 %num.i25.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = srem i32 %num.i25, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i24_i11_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i11.0 = shl i32 %den, 21
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i11 = ashr i32 %den.i11.0, 21
+  %result = srem i32 %num.i24, %den.i11
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i11_i24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i11.0 = shl i32 %num, 21
+  %den.i24.0 = shl i32 %den, 8
+  %num.i11 = ashr i32 %num.i11.0, 21
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = srem i32 %num.i11, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i17_i12_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 17
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i17.0 = shl i32 %num, 15
+  %den.i12.0 = shl i32 %den, 20
+  %num.i17 = ashr i32 %num.i17.0, 15
+  %den.i12 = ashr i32 %den.i12.0, 20
+  %result = sdiv i32 %num.i17, %den.i12
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
Index: test/CodeGen/AMDGPU/udiv.ll
===================================================================
--- test/CodeGen/AMDGPU/udiv.ll
+++ test/CodeGen/AMDGPU/udiv.ll
@@ -91,3 +91,57 @@
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_udiv_i8:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %num = load i8, i8 addrspace(1) * %in
+  %den = load i8, i8 addrspace(1) * %den_ptr
+  %result = udiv i8 %num, %den
+  %result.ext = zext i8 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i16:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+  %num = load i16, i16 addrspace(1) * %in
+  %den = load i16, i16 addrspace(1) * %den_ptr
+  %result = udiv i16 %num, %den
+  %result.ext = zext i16 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i23:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+  %num = load i23, i23 addrspace(1) * %in
+  %den = load i23, i23 addrspace(1) * %den_ptr
+  %result = udiv i23 %num, %den
+  %result.ext = zext i23 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i24:
+; SI-NOT: v_rcp_f32
+define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+  %num = load i24, i24 addrspace(1) * %in
+  %den = load i24, i24 addrspace(1) * %den_ptr
+  %result = udiv i24 %num, %den
+  %result.ext = zext i24 %result to i32
+  store i32 %result.ext, i32 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/udivrem.ll
===================================================================
--- test/CodeGen/AMDGPU/udivrem.ll
+++ test/CodeGen/AMDGPU/udivrem.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
Index: test/CodeGen/AMDGPU/udivrem24.ll
===================================================================
--- test/CodeGen/AMDGPU/udivrem24.ll
+++ test/CodeGen/AMDGPU/udivrem24.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}udiv24_i8:
@@ -40,7 +40,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}udiv24_i32:
+; FUNC-LABEL: {{^}}udiv23_i32:
 ; SI: v_cvt_f32_u32
 ; SI-DAG: v_cvt_f32_u32
 ; SI-DAG: v_rcp_f32
@@ -50,6 +50,23 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
+define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i23.0 = shl i32 %num, 9
+  %den.i23.0 = shl i32 %den, 9
+  %num.i23 = lshr i32 %num.i23.0, 9
+  %den.i23 = lshr i32 %den.i23.0, 9
+  %result = udiv i32 %num.i23, %den.i23
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
 define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
@@ -63,6 +80,40 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i23.0 = shl i32 %num, 9
+  %den.i24.0 = shl i32 %den, 8
+  %num.i23 = lshr i32 %num.i23.0, 9
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = udiv i32 %num.i23, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i23.0 = shl i32 %den, 9
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i23 = lshr i32 %den.i23.0, 9
+  %result = udiv i32 %num.i24, %den.i23
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}udiv25_i32:
 ; RCP_IFLAG is for URECIP in the full 32b alg
 ; SI: v_rcp_iflag
@@ -74,11 +125,11 @@
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
-  %num.i24.0 = shl i32 %num, 7
-  %den.i24.0 = shl i32 %den, 7
-  %num.i24 = lshr i32 %num.i24.0, 7
-  %den.i24 = lshr i32 %den.i24.0, 7
-  %result = udiv i32 %num.i24, %den.i24
+  %num.i25.0 = shl i32 %num, 7
+  %den.i25.0 = shl i32 %den, 7
+  %num.i25 = lshr i32 %num.i25.0, 7
+  %den.i25 = lshr i32 %den.i25.0, 7
+  %result = udiv i32 %num.i25, %den.i25
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -162,15 +213,8 @@
 }
 
 ; FUNC-LABEL: {{^}}urem24_i32:
-; SI: v_cvt_f32_u32
-; SI: v_cvt_f32_u32
-; SI: v_rcp_f32
-; SI: v_cvt_u32_f32
-
-; EG: UINT_TO_FLT
-; EG-DAG: UINT_TO_FLT
-; EG-DAG: RECIP_IEEE
-; EG: FLT_TO_UINT
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
 define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
@@ -243,3 +287,41 @@
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i16.0 = shl i32 %num, 16
+  %den.i23.0 = shl i32 %den, 9
+  %num.i16 = lshr i32 %num.i16.0, 16
+  %den.i23 = lshr i32 %den.i23.0, 9
+  %result = udiv i32 %num.i16, %den.i23
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+  %num.i23.0 = shl i32 %num, 9
+  %den.i16.0 = shl i32 %den, 16
+  %num.i23 = lshr i32 %num.i23.0, 9
+  %den.i16 = lshr i32 %den.i16.0, 16
+  %result = udiv i32 %num.i23, %den.i16
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
Index: test/CodeGen/AMDGPU/udivrem64.ll
===================================================================
--- test/CodeGen/AMDGPU/udivrem64.ll
+++ test/CodeGen/AMDGPU/udivrem64.ll
@@ -1,4 +1,4 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
 ;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
 ;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
@@ -184,7 +184,7 @@
   ret void
 }
 
-;FUNC-LABEL: {{^}}test_udiv2464:
+;FUNC-LABEL: {{^}}test_udiv2364:
 ;EG: UINT_TO_FLT
 ;EG: UINT_TO_FLT
 ;EG: FLT_TO_UINT
@@ -195,15 +195,15 @@
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
-  %1 = lshr i64 %x, 40
-  %2 = lshr i64 %y, 40
+define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 41
+  %2 = lshr i64 %y, 41
   %result = udiv i64 %1, %2
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-;FUNC-LABEL: {{^}}test_urem2464:
+;FUNC-LABEL: {{^}}test_urem2364:
 ;EG: UINT_TO_FLT
 ;EG: UINT_TO_FLT
 ;EG: FLT_TO_UINT
@@ -214,9 +214,9 @@
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
-  %1 = lshr i64 %x, 40
-  %2 = lshr i64 %y, 40
+define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 41
+  %2 = lshr i64 %y, 41
   %result = urem i64 %1, %2
   store i64 %result, i64 addrspace(1)* %out
   ret void
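
For readers unfamiliar with the DIVREM24 trick this patch is tightening: the lowering converts both operands to f32, multiplies by the hardware reciprocal, truncates, and applies a +/-1 correction, which is only exact while the operands and quotient fit in the f32 mantissa (the "24-bit" limit in the comment). The scalar sketch below, kept outside the diff, models that sequence plus the truncation step the patch adds. It is illustrative only: emulateDivRem24 and its divBits parameter are names invented here, the exact 1.0f/fb stands in for the approximate v_rcp_f32, and the step-by-step details follow the comments in the surrounding LowerDIVREM24 code rather than anything shown in this diff.

// Illustrative scalar model -- not part of the patch.
#include <cmath>
#include <cstdint>

// Models LowerDIVREM24 for i32 operands already known to have >= 9 sign bits,
// i.e. values that fit in 24 bits (unsigned) or 25 bits (signed).
// divBits corresponds to BitSize - SignBits (+1 when signed) in the patch.
static void emulateDivRem24(int32_t lhs, int32_t rhs, bool isSigned,
                            unsigned divBits, int32_t &div, int32_t &rem) {
  // jq is the +/-1 correction applied when the truncated f32 quotient is one
  // off; for the signed case it carries the sign of the quotient.
  int32_t jq = 1;
  if (isSigned)
    jq = ((lhs ^ rhs) >> 30) | 1;              // like SRA + OR 1: +1 or -1
                                               // (assumes arithmetic >>)

  float fa = static_cast<float>(lhs);          // SINT_TO_FP / UINT_TO_FP
  float fb = static_cast<float>(rhs);
  float fq = std::trunc(fa * (1.0f / fb));     // RCP, FMUL, FTRUNC
  float fr = std::fabs(-fq * fb + fa);         // |fa - fq * fb|
  int32_t iq = static_cast<int32_t>(fq);       // FP_TO_SINT / FP_TO_UINT

  div = iq + (fr >= std::fabs(fb) ? jq : 0);   // compare + select
  rem = lhs - div * rhs;                       // recompute the remainder

  // The truncation this patch adds: only divBits of the result are
  // meaningful, so sign-extend (signed) or mask (unsigned) from that width.
  if (isSigned) {
    unsigned sh = 32 - divBits;                // SIGN_EXTEND_INREG
    div = static_cast<int32_t>(static_cast<uint32_t>(div) << sh) >> sh;
    rem = static_cast<int32_t>(static_cast<uint32_t>(rem) << sh) >> sh;
  } else {
    uint32_t mask = static_cast<uint32_t>((UINT64_C(1) << divBits) - 1);
    div = static_cast<int32_t>(static_cast<uint32_t>(div) & mask);  // AND
    rem = static_cast<int32_t>(static_cast<uint32_t>(rem) & mask);
  }
}

For an i8 sdiv such as the v_sdiv_i8 test, divBits would be 8, and e.g. lhs = -7, rhs = 2 gives div = -3, rem = -1; with fewer than 9 sign bits on either operand the new guard returns SDValue() and the full 32-bit expansion is used instead.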