diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -523,6 +523,54 @@
     }
     break;
   }
+  case Intrinsic::fshl: {
+    if (ICA.getArgs().empty())
+      break;
+
+    // TODO: Add handling for fshl where third argument is not a constant.
+    // TODO: Also use for fshr.
+    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
+    if (!OpInfoZ.isConstant())
+      break;
+
+    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
+    if (OpInfoZ.isUniform()) {
+      // FIXME: The costs could be lower if the codegen is better.
+      static const CostTblEntry FshlTbl[] = {
+          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
+          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
+          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
+          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
+      const auto *Entry =
+          CostTableLookup(FshlTbl, ICA.getID(), LegalisationCost.second);
+      if (Entry)
+        return LegalisationCost.first * Entry->Cost;
+    }
+
+    auto TyL = getTypeLegalizationCost(RetTy);
+    int64_t NumElements = 1;
+    if (RetTy->isIntegerTy())
+      break;
+
+    // Estimate cost manually, as types like i8 and i16 will get promoted to
+    // i32 and CostTableLookup will ignore the extra conversion cost.
+    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
+                       RetTy->getScalarSizeInBits() != 64 &&
+                       RetTy->getScalarSizeInBits() < 64) ||
+                      (RetTy->getScalarSizeInBits() % 64 != 0);
+    unsigned Cost = 0;
+    if (RetTy->getScalarSizeInBits() == 32 ||
+        RetTy->getScalarSizeInBits() == 64)
+      Cost = 1; // fhsl for i32 and i64 can be lowered to a single extr
+                // instruction.
+    else if (HigherCost) {
+      Cost = 2;
+      dbgs() << "Cost: " << TyL.first * Cost << "\n";
+    } else
+      break;
+
+    return TyL.first * Cost * NumElements;
+  }
   default:
     break;
   }
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
--- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
@@ -96,7 +96,7 @@
 
 define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl
 ;
 entry:
@@ -106,7 +106,8 @@
 
 define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
+; CHECK-NEXT:  Cost: 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl
 ;
 entry:
@@ -128,7 +129,7 @@
 
 define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl
 ;
 entry:
@@ -138,7 +139,8 @@
 
 define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
+; CHECK-NEXT:  Cost: 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl
 ;
 entry:
@@ -160,7 +162,7 @@
 
 define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl
 ;
 entry:
@@ -170,7 +172,7 @@
 
 define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl
 ;
 entry:
@@ -192,7 +194,7 @@
 
 define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 1>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 1>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl
 ;
 entry:
@@ -202,7 +204,7 @@
 
 define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 2>)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl
 ;
 entry:
@@ -222,6 +224,18 @@
 
 declare <2 x i64> @llvm.fshl.v4i64(<2 x i64>, <2 x i64>, <2 x i64>)
 
+define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) {
+; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different'
+; CHECK-NEXT:  Cost: 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshl
+;
+entry:
+  %fshl = tail call <2 x i66> @llvm.fshl.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+  ret <2 x i66> %fshl
+}
+declare <2 x i66> @llvm.fshl.v4i66(<2 x i66>, <2 x i66>, <2 x i66>)
+
 define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
 ; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fshl.ll
@@ -6,21 +6,18 @@
 ; CHECK-LABEL: define i64 @fshl
 ; CHECK-SAME: (i64 [[OR1:%.*]], i64 [[OR2:%.*]], i64 [[OR3:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[OR2]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[OR3]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <2 x i64> <i64 1, i64 2>)
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[OR1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> <i64 poison, i64 0>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> <i64 17, i64 21>)
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> <i64 0, i64 poison>, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP2]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1
-; CHECK-NEXT:    [[ADD3:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT:    [[XOR5:%.*]] = xor i64 [[ADD3]], [[TMP12]]
+; CHECK-NEXT:    [[OR4:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR2]], i64 0, i64 1)
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i64 [[OR4]], 0
+; CHECK-NEXT:    [[OR5:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR3]], i64 0, i64 2)
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i64 [[OR5]], [[OR1]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[XOR1]], [[OR1]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 0, [[XOR2]]
+; CHECK-NEXT:    [[OR6:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[OR1]], i64 [[OR2]], i64 17)
+; CHECK-NEXT:    [[XOR3:%.*]] = xor i64 [[OR6]], [[ADD1]]
+; CHECK-NEXT:    [[OR7:%.*]] = tail call i64 @llvm.fshl.i64(i64 0, i64 0, i64 21)
+; CHECK-NEXT:    [[XOR4:%.*]] = xor i64 [[OR7]], [[ADD2]]
+; CHECK-NEXT:    [[ADD3:%.*]] = or i64 [[XOR3]], [[ADD2]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor i64 [[ADD3]], [[XOR4]]
 ; CHECK-NEXT:    ret i64 [[XOR5]]
 ;
 entry: