Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -478,6 +478,9 @@
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
 
+  setTargetDAGCombine(ISD::FP_TO_SINT);
+  setTargetDAGCombine(ISD::FP_TO_UINT);
+
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 
   setTargetDAGCombine(ISD::ANY_EXTEND);
@@ -7529,6 +7532,94 @@
   return SDValue();
 }
 
+// isConstVecPow2 - Return true if every operand of ConstVec is the same FP
+// constant, exactly 2^N with 1 <= N <= Range. On success C is set to 2^N.
+static bool isConstVecPow2(SDValue ConstVec, bool IsSigned, uint64_t &C,
+                           unsigned Range) {
+  integerPart CN;
+  integerPart C0 = 0;
+  for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
+       I != E; I++) {
+    ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
+    if (!CFP)
+      return false; // Operand is not a floating-point constant node.
+
+    bool IsExact;
+    APFloat APF = CFP->getValueAPF();
+    if (APF.convertToInteger(&CN, 64, IsSigned, APFloat::rmTowardZero,
+                             &IsExact) != APFloat::opOK ||
+        !IsExact)
+      return false; // Not exactly convertible to a 64-bit integer.
+
+    C0 = (I == 0) ? CN : C0; // Lane 0 sets the value all lanes must match.
+    if (!isPowerOf2_64(CN) || C0 != CN || Log2_64(C0) < 1 ||
+        Log2_64(C0) > Range)
+      return false;
+  }
+  C = C0;
+  return true;
+}
+
+/// FCVTZS (floating-point to fixed-point, Advanced SIMD) can replace
+/// combinations of FMUL and FCVT (floating-point to integer) when the FMUL has
+/// a constant operand that is a power of 2.
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  // getSimpleValueType() is only valid for simple types; reject the rest.
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() != ISD::FMUL || !Op.getValueType().isVector() ||
+      !Op.getValueType().isSimple() || !N->getValueType(0).isSimple())
+    return SDValue();
+
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  // Avoid widening conversions like float -> i64.
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  if ((IntBits != 16 && IntBits != 32 && IntBits != 64) || IntBits > FloatBits)
+    return SDValue();
+
+  // The multiplier must be a splat of 2^N; N is handed to the intrinsic below.
+  uint64_t C;
+  SDValue N0 = Op->getOperand(0);
+  SDValue ConstVec = Op->getOperand(1);
+  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
+      !isConstVecPow2(ConstVec, IsSigned, C, IntBits == 64 ? 64 : 32))
+    return SDValue();
+
+  // The intrinsic's result keeps the lane count and lane width of its FP
+  // input. Only v2f32, v4f32 and v2f64 are handled; in particular v4f64 is
+  // rejected instead of being paired with a mismatched v4i32 result.
+  MVT ResTy;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (NumLanes == 2)
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+  else if (NumLanes == 4 && FloatBits == 32)
+    ResTy = MVT::v4i32;
+  else
+    return SDValue();
+
+  SDLoc DL(N);
+  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
+                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
+  SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
+                                DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
+                                N0, DAG.getConstant(Log2_64(C), DL, MVT::i32));
+  // The conversion yields FloatBits-wide lanes, so narrower integer results
+  // need an extra truncate.
+  if (IntBits < FloatBits)
+    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
 /// An EXTR instruction is made up of two shifts, ORed together. This helper
 /// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, @@ -9400,6 +9491,9 @@ case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFpToIntCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: Index: test/CodeGen/AArch64/fcvt_combine.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fcvt_combine.ll @@ -0,0 +1,128 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzs.2s +; CHECK: ret +define <2 x i32> @test1(float %f) { +entry: + %vecinit1.i = insertelement <2 x float> undef, float %f, i32 0 + %vecinit2.i = insertelement <2 x float> %vecinit1.i, float %f, i32 1 + %mul.i = fmul <2 x float> %vecinit2.i, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; CHECK-LABEL: test2 +; CHECK-NOT: fmul.4s +; CHECK: fcvtzs.4s +; CHECK: ret +define <4 x i32> @test2(float %f) { +entry: + %vecinit1.i = insertelement <4 x float> undef, float %f, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %f, i32 1 + %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %f, i32 2 + %vecinit4.i = insertelement <4 x float> %vecinit3.i, float %f, i32 3 + %mul.i = fmul <4 x float> %vecinit4.i, + %vcvt.i = fptosi <4 x float> %mul.i to <4 x i32> + ret <4 x i32> %vcvt.i +} + +; CHECK-LABEL: test3 +; CHECK-NOT: fmul.2d +; CHECK: fcvtzs.2d +; CHECK: ret +define <2 x i64> @test3(double %d) { +entry: + %vecinit1.i = insertelement <2 x double> undef, double %d, i32 0 + %vecinit2.i = insertelement <2 x double> %vecinit1.i, double %d, i32 1 + %mul.i = fmul <2 x double> %vecinit2.i, + %vcvt.i = fptosi <2 x double> %mul.i to <2 x i64> + ret <2 x i64> %vcvt.i +} + +; Truncate double to i32 +; 
CHECK-LABEL: test4 +; CHECK-NOT: fmul.2d +; CHECK: fcvtzs.2d +; CHECK: xtn.2s +; CHECK: ret +define <2 x i32> @test4(double %d) { +entry: + %vecinit1.i = insertelement <2 x double> undef, double %d, i32 0 + %vecinit2.i = insertelement <2 x double> %vecinit1.i, double %d, i32 1 + %mul.i = fmul <2 x double> %vecinit2.i, + %vcvt.i = fptosi <2 x double> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Truncate float to i16 +; CHECK-LABEL: test5 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzs.2s +; CHECK: ret +define <2 x i16> @test5(float %f) { +entry: + %vecinit1.i = insertelement <2 x float> undef, float %f, i32 0 + %vecinit2.i = insertelement <2 x float> %vecinit1.i, float %f, i32 1 + %mul.i = fmul <2 x float> %vecinit2.i, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i16> + ret <2 x i16> %vcvt.i +} + +; Don't convert float to i64 +; CHECK-LABEL: test6 +; CHECK: fmul.2s +; CHECK: fcvtl +; CHECK: fcvtzs.2d +; CHECK: ret +define <2 x i64> @test6(float %f) { +entry: + %vecinit1.i = insertelement <2 x float> undef, float %f, i32 0 + %vecinit2.i = insertelement <2 x float> %vecinit1.i, float %f, i32 1 + %mul.i = fmul <2 x float> %vecinit2.i, + %vcvt.i = fptosi <2 x float> %mul.i to <2 x i64> + ret <2 x i64> %vcvt.i +} + +; Check unsigned conversion. +; CHECK-LABEL: test7 +; CHECK-NOT: fmul.2s +; CHECK: fcvtzu.2s +; CHECK: ret +define <2 x i32> @test7(float %f) { +entry: + %vecinit1.i = insertelement <2 x float> undef, float %f, i32 0 + %vecinit2.i = insertelement <2 x float> %vecinit1.i, float %f, i32 1 + %mul.i = fmul <2 x float> %vecinit2.i, + %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32> + ret <2 x i32> %vcvt.i +} + +; Test which should not fold due to non-power of 2. 
+; CHECK-LABEL: test8
+; CHECK: fmul.2s
+; CHECK: fcvtzu.2s
+; CHECK: ret
+define <2 x i32> @test8(float %f) {
+entry:
+  %vecinit1.i = insertelement <2 x float> undef, float %f, i32 0
+  %vecinit2.i = insertelement <2 x float> %vecinit1.i, float %f, i32 1
+  %mul.i = fmul <2 x float> %vecinit2.i, <float 1.700000e+01, float 1.700000e+01>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test which should not fold due to non-matching power of 2.
+; CHECK-LABEL: test9
+; CHECK: fmul.2s
+; CHECK: fcvtzu.2s
+; CHECK: ret
+define <2 x i32> @test9(float %f) {
+entry:
+  %vecinit1.i = insertelement <2 x float> undef, float %f, i32 0
+  %vecinit2.i = insertelement <2 x float> %vecinit1.i, float %f, i32 1
+  %mul.i = fmul <2 x float> %vecinit2.i, <float 1.600000e+01, float 8.000000e+00>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}