diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -382,9 +382,13 @@ "equivalent when the immediate does " "not fit in the encoding.">; -def FeatureLSLFast : SubtargetFeature< - "lsl-fast", "HasLSLFast", "true", - "CPU has a fastpath logical shift of up to 3 places">; +def FeatureAddrLSLFast : SubtargetFeature< + "addr-lsl-fast", "HasAddrLSLFast", "true", + "Address operands with logical shift of up to 3 places are cheap">; + +def FeatureALULSLFast : SubtargetFeature< + "alu-lsl-fast", "HasALULSLFast", "true", + "Add/Sub operations with lsl shift <= 4 are cheap">; def FeatureAggressiveFMA : SubtargetFeature<"aggressive-fma", @@ -841,7 +845,8 @@ "Cortex-A76 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -850,7 +855,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -859,7 +865,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -870,7 +877,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -880,7 +888,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -890,7 +899,8 @@ FeatureFuseAES, FeaturePostRAScheduler, FeatureCmpBccFusion, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -905,7 +915,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -915,14 +926,16 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", "Cortex-X3 ARM processors", [ - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureFuseAES, FeaturePostRAScheduler, @@ -1060,7 +1073,8 @@ FeatureFuseCCSelect, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive]>; @@ -1077,7 +1091,8 @@ FeatureFuseCCSelect, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureZCZeroing]>; @@ -1087,7 +1102,8 @@ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast] + FeatureAddrLSLFast, + FeatureALULSLFast] >; def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", @@ -1096,7 +1112,8 @@ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast, + FeatureAddrLSLFast, + 
FeatureALULSLFast, FeatureSlowSTRQro ]>; @@ -1110,7 +1127,8 @@ "Neoverse N1 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1119,7 +1137,8 @@ "Neoverse N2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1128,7 +1147,8 @@ "Neoverse 512-TVB ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1137,7 +1157,8 @@ "Neoverse V1 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive, @@ -1147,7 +1168,8 @@ "Neoverse V2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1158,7 +1180,8 @@ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast]>; + FeatureAddrLSLFast, + FeatureALULSLFast]>; def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99", "Cavium ThunderX2 processors", [ @@ -1210,7 +1233,8 @@ "Ampere Computing Ampere-1 processors", [ FeaturePostRAScheduler, FeatureFuseAES, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureAggressiveFMA, FeatureArithmeticBccFusion, FeatureCmpBccFusion, @@ -1221,7 +1245,8 @@ "Ampere Computing Ampere-1A processors", [ FeaturePostRAScheduler, FeatureFuseAES, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureAggressiveFMA, FeatureArithmeticBccFusion, FeatureCmpBccFusion, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -451,7 +451,8 @@ bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, SDValue &Offset, SDValue &SignExtend, SDValue &DoShift); - bool isWorthFolding(SDValue V) const; + bool isWorthFoldingALU(SDValue V, bool LSL = false) const; + bool isWorthFoldingAddr(SDValue V) const; bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, SDValue &Offset, SDValue &SignExtend); @@ -660,18 +661,19 @@ return true; } -/// Determine whether it is worth to fold V into an extended register. -bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { +/// Determine whether it is worth folding V into an extended register addressing +/// mode. +bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const { // Trivial if we are optimizing for code size or if there is only // one use of the value. if (CurDAG->shouldOptForSize() || V.hasOneUse()) return true; // If a subtarget has a fastpath LSL we can fold a logical shift into // the addressing mode and save a cycle. 
- if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL && + if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V)) return true; - if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) { + if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) { const SDValue LHS = V.getOperand(0); const SDValue RHS = V.getOperand(1); if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS)) @@ -762,35 +764,6 @@ return true; } -/// SelectShiftedRegister - Select a "shifted register" operand. If the value -/// is not shifted, set the Shift operand to default of "LSL 0". The logical -/// instructions allow the shifted register to be rotated, but the arithmetic -/// instructions do not. The AllowROR parameter specifies whether ROR is -/// supported. -bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, - SDValue &Reg, SDValue &Shift) { - if (SelectShiftedRegisterFromAnd(N, Reg, Shift)) - return true; - - AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); - if (ShType == AArch64_AM::InvalidShiftExtend) - return false; - if (!AllowROR && ShType == AArch64_AM::ROR) - return false; - - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - unsigned BitSize = N.getValueSizeInBits(); - unsigned Val = RHS->getZExtValue() & (BitSize - 1); - unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); - - Reg = N.getOperand(0); - Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32); - return isWorthFolding(N); - } - - return false; -} - /// getExtendTypeForNode - Translate an extend node to the corresponding /// ExtendType value. static AArch64_AM::ShiftExtendType @@ -845,6 +818,56 @@ return AArch64_AM::InvalidShiftExtend; } +/// Determine whether it is worth folding V into an extended register of an +/// Add/Sub. LSL means we are folding into an `add w0, w1, w2, lsl #N` +/// instruction, and the shift should be treated as worth folding even if it has +/// multiple uses. +bool AArch64DAGToDAGISel::isWorthFoldingALU(SDValue V, bool LSL) const { + // Trivial if we are optimizing for code size or if there is only + // one use of the value. + if (CurDAG->shouldOptForSize() || V.hasOneUse()) + return true; + + // If a subtarget has a fastpath LSL we can fold a logical shift into + // the add/sub and save a cycle. + if (LSL && Subtarget->hasALULSLFast() && V.getOpcode() == ISD::SHL && + V.getConstantOperandVal(1) <= 4 && + getExtendTypeForNode(V.getOperand(0)) == AArch64_AM::InvalidShiftExtend) + return true; + + // It hurts otherwise, since the value will be reused. + return false; +} + +/// SelectShiftedRegister - Select a "shifted register" operand. If the value +/// is not shifted, set the Shift operand to default of "LSL 0". The logical +/// instructions allow the shifted register to be rotated, but the arithmetic +/// instructions do not. The AllowROR parameter specifies whether ROR is +/// supported. 
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, + SDValue &Reg, SDValue &Shift) { + if (SelectShiftedRegisterFromAnd(N, Reg, Shift)) + return true; + + AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); + if (ShType == AArch64_AM::InvalidShiftExtend) + return false; + if (!AllowROR && ShType == AArch64_AM::ROR) + return false; + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + unsigned BitSize = N.getValueSizeInBits(); + unsigned Val = RHS->getZExtValue() & (BitSize - 1); + unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); + + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32); + return isWorthFoldingALU(N, true); + } + + return false; +} + /// Instructions that accept extend modifiers like UXTW expect the register /// being extended to be a GPR32, but the incoming DAG might be acting on a /// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if @@ -925,7 +948,7 @@ Reg = narrowIfNeeded(CurDAG, Reg); Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N), MVT::i32); - return isWorthFolding(N); + return isWorthFoldingALU(N); } /// SelectArithUXTXRegister - Select a "UXTX register" operand. This @@ -949,7 +972,7 @@ Reg = N.getOperand(0); Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N), MVT::i32); - return isWorthFolding(N); + return isWorthFoldingALU(N); } /// If there's a use of this ADDlow that's not itself a load/store then we'll @@ -1164,7 +1187,7 @@ if (ShiftVal != 0 && ShiftVal != LegalShiftVal) return false; - return isWorthFolding(N); + return isWorthFoldingAddr(N); } bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, @@ -1192,7 +1215,7 @@ } // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N); // Try to match a shifted extend on the RHS. if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && @@ -1222,7 +1245,7 @@ Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, MVT::i32); - if (isWorthFolding(LHS)) + if (isWorthFoldingAddr(LHS)) return true; } @@ -1234,7 +1257,7 @@ Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, MVT::i32); - if (isWorthFolding(RHS)) + if (isWorthFoldingAddr(RHS)) return true; } @@ -1305,7 +1328,7 @@ } // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N); // Try to match a shifted extend on the RHS. 
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16454,7 +16454,7 @@ } else if (SCVPlus1.isPowerOf2()) { ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes)); - } else if (Subtarget->hasLSLFast() && + } else if (Subtarget->hasALULSLFast() && isPowPlusPlusConst(ConstValue, CVM, CVN)) { APInt CVMMinus1 = CVM - 1; APInt CVNMinus1 = CVN - 1; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -6079,7 +6079,7 @@ // It's better to avoid folding and recomputing shifts when we don't have a // fastpath. - if (!STI.hasLSLFast()) + if (!STI.hasAddrLSLFast()) return false; // We have a fastpath, so folding a shift in and potentially computing it diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir @@ -24,7 +24,7 @@ define void @ldbbrox(i64* %addr) { ret void } define void @ldrqrox(i64* %addr) { ret void } attributes #0 = { optsize } - attributes #1 = { "target-features"="+lsl-fast" } + attributes #1 = { "target-features"="+addr-lsl-fast" } ... --- diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll --- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3 %struct.a = type [256 x i16] %struct.b = type [256 x i32] diff --git a/llvm/test/CodeGen/AArch64/lslfast.ll b/llvm/test/CodeGen/AArch64/lslfast.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/lslfast.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SLOW +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+alu-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK-FAST + +define i32 @testmul3(i32 noundef %x, i32 noundef %y, i32 noundef %z) { +; CHECK-SLOW-LABEL: testmul3: +; CHECK-SLOW: // %bb.0: // %entry +; CHECK-SLOW-NEXT: lsl w8, w0, #3 +; CHECK-SLOW-NEXT: add w9, w8, w1 +; CHECK-SLOW-NEXT: add w8, w8, w2 +; CHECK-SLOW-NEXT: mul w0, w8, w9 +; CHECK-SLOW-NEXT: ret +; +; CHECK-FAST-LABEL: testmul3: +; CHECK-FAST: // %bb.0: // %entry +; CHECK-FAST-NEXT: add w8, w1, w0, lsl #3 +; CHECK-FAST-NEXT: add w9, w2, w0, lsl #3 +; CHECK-FAST-NEXT: mul w0, w9, w8 +; CHECK-FAST-NEXT: ret +entry: + %shl = shl i32 %x, 3 + %add = add nsw i32 %shl, %y + %add2 = add nsw i32 %shl, %z + %mul = mul nsw i32 %add2, %add + ret i32 %mul +} + +define i32 @testvar(i32 noundef %x, i32 
noundef %y, i32 noundef %z, i32 %zz) { +; CHECK-LABEL: testvar: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, w3 +; CHECK-NEXT: add w9, w8, w1 +; CHECK-NEXT: add w8, w8, w2 +; CHECK-NEXT: mul w0, w8, w9 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, %zz + %add = add nsw i32 %shl, %y + %add2 = add nsw i32 %shl, %z + %mul = mul nsw i32 %add2, %add + ret i32 %mul +} + +define i32 @testmul5(i32 noundef %x, i32 noundef %y, i32 noundef %z) { +; CHECK-LABEL: testmul5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, #5 +; CHECK-NEXT: add w9, w8, w1 +; CHECK-NEXT: add w8, w8, w2 +; CHECK-NEXT: mul w0, w8, w9 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, 5 + %add = add nsw i32 %shl, %y + %add2 = add nsw i32 %shl, %z + %mul = mul nsw i32 %add2, %add + ret i32 %mul +} + +define i64 @testsext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: testsext3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sbfiz x8, x0, #3, #32 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %conv = sext i32 %x to i64 + %shl = shl nsw i64 %conv, 3 + %add = add nsw i64 %shl, %y + %add3 = add nsw i64 %shl, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} + +define i64 @testzext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: testzext3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: ubfiz x8, x0, #3, #32 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %conv = zext i32 %x to i64 + %shl = shl nsw i64 %conv, 3 + %add = add nsw i64 %shl, %y + %add3 = add nsw i64 %shl, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} + +define i64 @test3sext(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: test3sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: sxtw x8, w8 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, 3 + %conv = sext i32 %shl to i64 + %add = add nsw i64 %conv, %y + %add3 = add nsw i64 %conv, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} + +define i64 @test3zext(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: test3zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, 3 + %conv = zext i32 %shl to i64 + %add = add nsw i64 %conv, %y + %add3 = add nsw i64 %conv, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll --- a/llvm/test/CodeGen/AArch64/mul_pow2.ll +++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll @@ -493,7 +493,7 @@ ret i32 %mul } -define i32 @test25_fast_shift(i32 %x) "target-features"="+lsl-fast" { +define i32 @test25_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test25_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, w0, lsl #2 @@ -510,7 +510,7 @@ ret i32 %mul } -define i32 @test45_fast_shift(i32 %x) "target-features"="+lsl-fast" { +define i32 @test45_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test45_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, w0, lsl #2 @@ -546,7 +546,7 @@ } ; Negative test: The shift amount 4 larger than 3 -define i32 @test85_fast_shift(i32 
%x) "target-features"="+lsl-fast" { +define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test85_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #85 @@ -564,7 +564,7 @@ } ; Negative test: The shift amount 5 larger than 3 -define i32 @test297_fast_shift(i32 %x) "target-features"="+lsl-fast" { +define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test297_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #297