diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -523,6 +523,54 @@ } break; } + case Intrinsic::fshl: { + if (ICA.getArgs().empty()) + break; + + // TODO: Add handling for fshl where third argument is not a constant. + // TODO: Also use for fshr. + const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]); + if (!OpInfoZ.isConstant()) + break; + + const auto LegalisationCost = getTypeLegalizationCost(RetTy); + if (OpInfoZ.isUniform()) { + // FIXME: The costs could be lower if the codegen is better. + static const CostTblEntry FshlTbl[] = { + {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr + {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4}, + {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3}, + {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}}; + const auto *Entry = + CostTableLookup(FshlTbl, ICA.getID(), LegalisationCost.second); + if (Entry) + return LegalisationCost.first * Entry->Cost; + } + + auto TyL = getTypeLegalizationCost(RetTy); + int64_t NumElements = 1; + if (RetTy->isIntegerTy()) + break; + + // Estimate cost manually, as types like i8 and i16 will get promoted to + // i32 and CostTableLookup will ignore the extra conversion cost. + bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && + RetTy->getScalarSizeInBits() != 64 && + RetTy->getScalarSizeInBits() < 64) || + (RetTy->getScalarSizeInBits() % 64 != 0); + unsigned Cost = 0; + if (RetTy->getScalarSizeInBits() == 32 || + RetTy->getScalarSizeInBits() == 64) + Cost = 1; // fhsl for i32 and i64 can be lowered to a single extr + // instruction. + else if (HigherCost) { + Cost = 2; + dbgs() << "Cost: " << TyL.first * Cost << "\n"; + } else + break; + + return TyL.first * Cost * NumElements; + } default: break; } diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll --- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll +++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll @@ -96,7 +96,7 @@ define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl ; entry: @@ -106,7 +106,8 @@ define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +; CHECK-NEXT: Cost: 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl ; entry: @@ -128,7 +129,7 @@ define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl ; entry: @@ -138,7 +139,8 @@ define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +; CHECK-NEXT: Cost: 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl ; entry: @@ -160,7 +162,7 @@ define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl ; entry: @@ -170,7 +172,7 @@ define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl ; entry: @@ -192,7 +194,7 @@ define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl ; entry: @@ -202,7 +204,7 @@ define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl ; entry: @@ -222,6 +224,18 @@ declare <2 x i64> @llvm.fshl.v4i64(<2 x i64>, <2 x i64>, <2 x i64>) +define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) { +; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost: 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshl +; +entry: + %fshl = tail call <2 x i66> @llvm.fshl.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) + ret <2 x i66> %fshl +} +declare <2 x i66> @llvm.fshl.v4i66(<2 x i66>, <2 x i66>, <2 x i66>) + define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) { ; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var' ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll @@ -6,21 +6,18 @@ ; CHECK-LABEL: define i64 @fshl ; CHECK-SAME: (i64 [[OR1:%.*]], i64 [[OR2:%.*]], i64 [[OR3:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[OR2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[OR3]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <2 x i64> ) -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> , i64 [[OR1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> , <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> ) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> , <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i64> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1 -; CHECK-NEXT: [[ADD3:%.*]] = or i64 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[XOR5:%.*]] = xor i64 [[ADD3]], [[TMP12]] +; CHECK-NEXT: [[OR4:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR2]], i64 0, i64 1) +; CHECK-NEXT: [[XOR1:%.*]] = xor i64 [[OR4]], 0 +; CHECK-NEXT: [[OR5:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR3]], i64 0, i64 2) +; CHECK-NEXT: [[XOR2:%.*]] = xor i64 [[OR5]], [[OR1]] +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[XOR1]], [[OR1]] +; CHECK-NEXT: [[ADD2:%.*]] = add i64 0, [[XOR2]] +; CHECK-NEXT: [[OR6:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR1]], i64 [[OR2]], i64 17) +; CHECK-NEXT: [[XOR3:%.*]] = xor i64 [[OR6]], [[ADD1]] +; CHECK-NEXT: [[OR7:%.*]] = tail call i64 @llvm.fshl.i64(i64 0, i64 0, i64 21) +; CHECK-NEXT: [[XOR4:%.*]] = xor i64 [[OR7]], [[ADD2]] +; CHECK-NEXT: [[ADD3:%.*]] = or i64 [[XOR3]], [[ADD2]] +; CHECK-NEXT: [[XOR5:%.*]] = xor i64 [[ADD3]], [[XOR4]] ; CHECK-NEXT: ret i64 [[XOR5]] ; entry: