diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -382,9 +382,13 @@ "equivalent when the immediate does " "not fit in the encoding.">; -def FeatureLSLFast : SubtargetFeature< - "lsl-fast", "HasLSLFast", "true", - "CPU has a fastpath logical shift of up to 3 places">; +def FeatureAddrLSLFast : SubtargetFeature< + "addr-lsl-fast", "HasAddrLSLFast", "true", + "Address operands with logical shift of up to 3 places are cheap">; + +def FeatureALULSLFast : SubtargetFeature< + "alu-lsl-fast", "HasALULSLFast", "true", + "Add/Sub operations with lsl shift <= 4 are cheap">; def FeatureAggressiveFMA : SubtargetFeature<"aggressive-fma", @@ -841,7 +845,8 @@ "Cortex-A76 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -850,7 +855,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -859,7 +865,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -870,7 +877,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -880,7 +888,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -890,7 +899,8 @@ FeatureFuseAES, FeaturePostRAScheduler, FeatureCmpBccFusion, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -905,7 +915,8 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -915,14 +926,16 @@ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", "Cortex-X3 ARM processors", [ - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureFuseAES, FeaturePostRAScheduler, @@ -1060,7 +1073,8 @@ FeatureFuseCCSelect, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive]>; @@ -1077,7 +1091,8 @@ FeatureFuseCCSelect, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureZCZeroing]>; @@ -1087,7 +1102,8 @@ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast] + FeatureAddrLSLFast, + FeatureALULSLFast] >; def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", @@ -1096,7 +1112,8 @@ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast, + FeatureAddrLSLFast, + 
FeatureALULSLFast, FeatureSlowSTRQro ]>; @@ -1110,7 +1127,8 @@ "Neoverse N1 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1119,7 +1137,8 @@ "Neoverse N2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1128,7 +1147,8 @@ "Neoverse 512-TVB ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1137,7 +1157,8 @@ "Neoverse V1 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive, @@ -1147,7 +1168,8 @@ "Neoverse V2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -1158,7 +1180,8 @@ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast]>; + FeatureAddrLSLFast, + FeatureALULSLFast]>; def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99", "Cavium ThunderX2 processors", [ @@ -1210,7 +1233,8 @@ "Ampere Computing Ampere-1 processors", [ FeaturePostRAScheduler, FeatureFuseAES, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureAggressiveFMA, FeatureArithmeticBccFusion, FeatureCmpBccFusion, @@ -1221,7 +1245,8 @@ "Ampere Computing Ampere-1A processors", [ FeaturePostRAScheduler, FeatureFuseAES, - FeatureLSLFast, + FeatureAddrLSLFast, + FeatureALULSLFast, FeatureAggressiveFMA, FeatureArithmeticBccFusion, FeatureCmpBccFusion, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -451,7 +451,8 @@ bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, SDValue &Offset, SDValue &SignExtend, SDValue &DoShift); - bool isWorthFolding(SDValue V) const; + bool isWorthFoldingALU(SDValue V, bool LSL = false) const; + bool isWorthFoldingAddr(SDValue V) const; bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, SDValue &Offset, SDValue &SignExtend); @@ -660,18 +661,19 @@ return true; } -/// Determine whether it is worth to fold V into an extended register. -bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { +/// Determine whether it is worth folding V into an extended register addressing +/// mode. +bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const { // Trivial if we are optimizing for code size or if there is only // one use of the value. if (CurDAG->shouldOptForSize() || V.hasOneUse()) return true; // If a subtarget has a fastpath LSL we can fold a logical shift into // the addressing mode and save a cycle. 
- if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL && + if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V)) return true; - if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) { + if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) { const SDValue LHS = V.getOperand(0); const SDValue RHS = V.getOperand(1); if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS)) @@ -762,35 +764,6 @@ return true; } -/// SelectShiftedRegister - Select a "shifted register" operand. If the value -/// is not shifted, set the Shift operand to default of "LSL 0". The logical -/// instructions allow the shifted register to be rotated, but the arithmetic -/// instructions do not. The AllowROR parameter specifies whether ROR is -/// supported. -bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, - SDValue &Reg, SDValue &Shift) { - if (SelectShiftedRegisterFromAnd(N, Reg, Shift)) - return true; - - AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); - if (ShType == AArch64_AM::InvalidShiftExtend) - return false; - if (!AllowROR && ShType == AArch64_AM::ROR) - return false; - - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - unsigned BitSize = N.getValueSizeInBits(); - unsigned Val = RHS->getZExtValue() & (BitSize - 1); - unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); - - Reg = N.getOperand(0); - Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32); - return isWorthFolding(N); - } - - return false; -} - /// getExtendTypeForNode - Translate an extend node to the corresponding /// ExtendType value. static AArch64_AM::ShiftExtendType @@ -845,6 +818,56 @@ return AArch64_AM::InvalidShiftExtend; } +/// Determine whether it is worth folding V into an extended register of an +/// Add/Sub. LSL means we are folding into an `add w0, w1, w2, lsl #N` +/// instruction, and the shift should be treated as worth folding even if it has +/// multiple uses. +bool AArch64DAGToDAGISel::isWorthFoldingALU(SDValue V, bool LSL) const { + // Trivial if we are optimizing for code size or if there is only + // one use of the value. + if (CurDAG->shouldOptForSize() || V.hasOneUse()) + return true; + + // If a subtarget has a fastpath LSL we can fold a logical shift into + // the add/sub and save a cycle. + if (LSL && Subtarget->hasALULSLFast() && V.getOpcode() == ISD::SHL && + V.getConstantOperandVal(1) <= 4 && + getExtendTypeForNode(V.getOperand(0)) == AArch64_AM::InvalidShiftExtend) + return true; + + // It hurts otherwise, since the value will be reused. + return false; +} + +/// SelectShiftedRegister - Select a "shifted register" operand. If the value +/// is not shifted, set the Shift operand to default of "LSL 0". The logical +/// instructions allow the shifted register to be rotated, but the arithmetic +/// instructions do not. The AllowROR parameter specifies whether ROR is +/// supported. 
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, + SDValue &Reg, SDValue &Shift) { + if (SelectShiftedRegisterFromAnd(N, Reg, Shift)) + return true; + + AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); + if (ShType == AArch64_AM::InvalidShiftExtend) + return false; + if (!AllowROR && ShType == AArch64_AM::ROR) + return false; + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + unsigned BitSize = N.getValueSizeInBits(); + unsigned Val = RHS->getZExtValue() & (BitSize - 1); + unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); + + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32); + return isWorthFoldingALU(N, true); + } + + return false; +} + /// Instructions that accept extend modifiers like UXTW expect the register /// being extended to be a GPR32, but the incoming DAG might be acting on a /// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if @@ -925,7 +948,7 @@ Reg = narrowIfNeeded(CurDAG, Reg); Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N), MVT::i32); - return isWorthFolding(N); + return isWorthFoldingALU(N); } /// SelectArithUXTXRegister - Select a "UXTX register" operand. This @@ -949,7 +972,7 @@ Reg = N.getOperand(0); Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N), MVT::i32); - return isWorthFolding(N); + return isWorthFoldingALU(N); } /// If there's a use of this ADDlow that's not itself a load/store then we'll @@ -1164,7 +1187,7 @@ if (ShiftVal != 0 && ShiftVal != LegalShiftVal) return false; - return isWorthFolding(N); + return isWorthFoldingAddr(N); } bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, @@ -1192,7 +1215,7 @@ } // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N); // Try to match a shifted extend on the RHS. if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && @@ -1222,7 +1245,7 @@ Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, MVT::i32); - if (isWorthFolding(LHS)) + if (isWorthFoldingAddr(LHS)) return true; } @@ -1234,7 +1257,7 @@ Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, MVT::i32); - if (isWorthFolding(RHS)) + if (isWorthFoldingAddr(RHS)) return true; } @@ -1305,7 +1328,7 @@ } // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N); // Try to match a shifted extend on the RHS. 
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16454,7 +16454,7 @@ } else if (SCVPlus1.isPowerOf2()) { ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes)); - } else if (Subtarget->hasLSLFast() && + } else if (Subtarget->hasALULSLFast() && isPowPlusPlusConst(ConstValue, CVM, CVN)) { APInt CVMMinus1 = CVM - 1; APInt CVNMinus1 = CVN - 1; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -6079,7 +6079,7 @@ // It's better to avoid folding and recomputing shifts when we don't have a // fastpath. - if (!STI.hasLSLFast()) + if (!STI.hasAddrLSLFast()) return false; // We have a fastpath, so folding a shift in and potentially computing it diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir @@ -24,7 +24,7 @@ define void @ldbbrox(i64* %addr) { ret void } define void @ldrqrox(i64* %addr) { ret void } attributes #0 = { optsize } - attributes #1 = { "target-features"="+lsl-fast" } + attributes #1 = { "target-features"="+addr-lsl-fast" } ... --- diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll --- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3 %struct.a = type [256 x i16] %struct.b = type [256 x i32] diff --git a/llvm/test/CodeGen/AArch64/lslfast.ll b/llvm/test/CodeGen/AArch64/lslfast.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/lslfast.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SLOW +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+alu-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK-FAST + +define i32 @testmul3(i32 noundef %x, i32 noundef %y, i32 noundef %z) { +; CHECK-SLOW-LABEL: testmul3: +; CHECK-SLOW: // %bb.0: // %entry +; CHECK-SLOW-NEXT: lsl w8, w0, #3 +; CHECK-SLOW-NEXT: add w9, w8, w1 +; CHECK-SLOW-NEXT: add w8, w8, w2 +; CHECK-SLOW-NEXT: mul w0, w8, w9 +; CHECK-SLOW-NEXT: ret +; +; CHECK-FAST-LABEL: testmul3: +; CHECK-FAST: // %bb.0: // %entry +; CHECK-FAST-NEXT: add w8, w1, w0, lsl #3 +; CHECK-FAST-NEXT: add w9, w2, w0, lsl #3 +; CHECK-FAST-NEXT: mul w0, w9, w8 +; CHECK-FAST-NEXT: ret +entry: + %shl = shl i32 %x, 3 + %add = add nsw i32 %shl, %y + %add2 = add nsw i32 %shl, %z + %mul = mul nsw i32 %add2, %add + ret i32 %mul +} + +define i32 @testvar(i32 noundef %x, i32 
noundef %y, i32 noundef %z, i32 %zz) { +; CHECK-LABEL: testvar: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, w3 +; CHECK-NEXT: add w9, w8, w1 +; CHECK-NEXT: add w8, w8, w2 +; CHECK-NEXT: mul w0, w8, w9 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, %zz + %add = add nsw i32 %shl, %y + %add2 = add nsw i32 %shl, %z + %mul = mul nsw i32 %add2, %add + ret i32 %mul +} + +define i32 @testmul5(i32 noundef %x, i32 noundef %y, i32 noundef %z) { +; CHECK-LABEL: testmul5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, #5 +; CHECK-NEXT: add w9, w8, w1 +; CHECK-NEXT: add w8, w8, w2 +; CHECK-NEXT: mul w0, w8, w9 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, 5 + %add = add nsw i32 %shl, %y + %add2 = add nsw i32 %shl, %z + %mul = mul nsw i32 %add2, %add + ret i32 %mul +} + +define i64 @testsext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: testsext3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sbfiz x8, x0, #3, #32 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %conv = sext i32 %x to i64 + %shl = shl nsw i64 %conv, 3 + %add = add nsw i64 %shl, %y + %add3 = add nsw i64 %shl, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} + +define i64 @testzext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: testzext3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: ubfiz x8, x0, #3, #32 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %conv = zext i32 %x to i64 + %shl = shl nsw i64 %conv, 3 + %add = add nsw i64 %shl, %y + %add3 = add nsw i64 %shl, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} + +define i64 @test3sext(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: test3sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: sxtw x8, w8 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, 3 + %conv = sext i32 %shl to i64 + %add = add nsw i64 %conv, %y + %add3 = add nsw i64 %conv, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} + +define i64 @test3zext(i32 noundef %x, i64 noundef %y, i64 noundef %z) { +; CHECK-LABEL: test3zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: add x9, x8, x1 +; CHECK-NEXT: add x8, x8, x2 +; CHECK-NEXT: mul x0, x9, x8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %x, 3 + %conv = zext i32 %shl to i64 + %add = add nsw i64 %conv, %y + %add3 = add nsw i64 %conv, %z + %mul = mul nsw i64 %add, %add3 + ret i64 %mul +} diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll --- a/llvm/test/CodeGen/AArch64/mul_pow2.ll +++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll @@ -493,7 +493,7 @@ ret i32 %mul } -define i32 @test25_fast_shift(i32 %x) "target-features"="+lsl-fast" { +define i32 @test25_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test25_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, w0, lsl #2 @@ -510,7 +510,7 @@ ret i32 %mul } -define i32 @test45_fast_shift(i32 %x) "target-features"="+lsl-fast" { +define i32 @test45_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test45_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, w0, lsl #2 @@ -546,7 +546,7 @@ } ; Negative test: The shift amount 4 larger than 3 -define i32 @test85_fast_shift(i32 
%x) "target-features"="+lsl-fast" { +define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test85_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #85 @@ -564,7 +564,7 @@ } ; Negative test: The shift amount 5 larger than 3 -define i32 @test297_fast_shift(i32 %x) "target-features"="+lsl-fast" { +define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: test297_fast_shift: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #297