diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -383,6 +383,10 @@
                                        "lsl-fast", "HasLSLFast", "true",
                                        "CPU has a fastpath logical shift of up to 3 places">;
 
+def FeatureShiftBy2Slow : SubtargetFeature<
+    "shift-2-slow", "HasSlowShift2", "true",
+    "CPU needs an extra cycle for a load/store with a register offset shifted by 1 (scaled by 2)">;
+
 def FeatureAggressiveFMA :
     SubtargetFeature<"aggressive-fma",
                      "HasAggressiveFMA",
@@ -759,6 +763,7 @@
                                  FeatureFuseAdrpAdd,
                                  FeatureBalanceFPOps,
                                  FeatureCustomCheapAsMoveHandling,
+                                 FeatureShiftBy2Slow,
                                  FeaturePostRAScheduler]>;
 
 def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
@@ -766,6 +771,7 @@
                                FeatureFuseAES,
                                FeatureFuseAdrpAdd,
                                FeaturePostRAScheduler,
+                               FeatureShiftBy2Slow,
                                FeatureFuseAddress]>;
 
 def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
@@ -815,6 +821,7 @@
                                FeatureFuseAES,
                                FeatureFuseAdrpAdd,
                                FeatureEnableSelectOptimize,
+                               FeatureShiftBy2Slow,
                                FeaturePredictableSelectIsExpensive]>;
 
 def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
@@ -1118,6 +1125,7 @@
                                  FeatureFuseAES,
                                  FeatureFuseAdrpAdd,
                                  FeatureLSLFast,
+                                 FeatureShiftBy2Slow,
                                  FeaturePostRAScheduler,
                                  FeatureEnableSelectOptimize,
                                  FeaturePredictableSelectIsExpensive]>;
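Side note for local experimentation, not part of the patch: the new tuning can be tried out without selecting a whole tune list, since subtarget features of this kind are also accepted per function via the "target-features" attribute. A minimal sketch, assuming only the feature spellings defined above; the address needs more than one use because single-use values are folded unconditionally by isWorthFolding:

```llvm
; Hypothetical experiment: opt a single function into the new tuning.
; The load and the store share the scaled address, so the fold heuristic
; runs; with +shift-2-slow the lsl #1 is expected to stay a separate
; instruction, mirroring the CHECK1 run in the test below.
define i16 @roundtrip(ptr %p, i64 %i) "target-features"="+lsl-fast,+shift-2-slow" {
  %g = getelementptr inbounds i16, ptr %p, i64 %i   ; index scaled by 2 (lsl #1)
  %v = load i16, ptr %g
  store i16 %v, ptr %g
  ret i16 %v
}
```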
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -452,7 +452,7 @@
   bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
                          SDValue &Offset, SDValue &SignExtend,
                          SDValue &DoShift);
-  bool isWorthFolding(SDValue V) const;
+  bool isWorthFolding(SDValue V, bool FoldToBaseAddr = false) const;
   bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
                          SDValue &Offset, SDValue &SignExtend);
 
@@ -631,45 +631,56 @@
 /// Determine whether it is worth it to fold SHL into the addressing
 /// mode.
-static bool isWorthFoldingSHL(SDValue V) {
+static bool isWorthFoldingSHL(SDValue V, bool FoldToBaseAddr = false,
+                              bool SlowShift = false) {
   assert(V.getOpcode() == ISD::SHL && "invalid opcode");
   // It is worth folding logical shift of up to three places.
   auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
   if (!CSD)
     return false;
   unsigned ShiftVal = CSD->getZExtValue();
+  // Some older Arm cores need an extra cycle for a load/store whose register
+  // offset is shifted by 1 (i.e. scaled by 2), so do not fold the shift there.
+  if ((ShiftVal == 1) && SlowShift)
+    return false;
   if (ShiftVal > 3)
     return false;
 
   // Check if this particular node is reused in any non-memory related
   // operation. If yes, do not try to fold this node into the address
   // computation, since the computation will be kept.
-  const SDNode *Node = V.getNode();
-  for (SDNode *UI : Node->uses())
-    if (!isa<MemSDNode>(*UI))
-      for (SDNode *UII : UI->uses())
-        if (!isa<MemSDNode>(*UII))
-          return false;
+  if (!FoldToBaseAddr) {
+    const SDNode *Node = V.getNode();
+    for (SDNode *UI : Node->uses())
+      if (!isa<MemSDNode>(*UI))
+        for (SDNode *UII : UI->uses())
+          if (!isa<MemSDNode>(*UII))
+            return false;
+  }
   return true;
 }
 
 /// Determine whether it is worth to fold V into an extended register.
-bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+bool AArch64DAGToDAGISel::isWorthFolding(SDValue V,
+                                         bool FoldToBaseAddr) const {
+  bool AllowLSLFast = Subtarget->hasLSLFast() || FoldToBaseAddr;
+  bool ShiftBy2Slow = Subtarget->hasSlowShift2();
   // Trivial if we are optimizing for code size or if there is only
   // one use of the value.
   if (CurDAG->shouldOptForSize() || V.hasOneUse())
     return true;
 
   // If a subtarget has a fastpath LSL we can fold a logical shift into
   // the addressing mode and save a cycle.
-  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
-      isWorthFoldingSHL(V))
+  if (AllowLSLFast && V.getOpcode() == ISD::SHL &&
+      isWorthFoldingSHL(V, FoldToBaseAddr, ShiftBy2Slow))
     return true;
-  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+  if (AllowLSLFast && V.getOpcode() == ISD::ADD) {
     const SDValue LHS = V.getOperand(0);
     const SDValue RHS = V.getOperand(1);
-    if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
+    if (LHS.getOpcode() == ISD::SHL &&
+        isWorthFoldingSHL(LHS, FoldToBaseAddr, ShiftBy2Slow))
       return true;
-    if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
+    if (RHS.getOpcode() == ISD::SHL &&
+        isWorthFoldingSHL(RHS, FoldToBaseAddr, ShiftBy2Slow))
       return true;
   }
 
@@ -1185,7 +1196,8 @@
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+  bool IsExtendedRegisterWorthFolding =
+      isWorthFolding(N, /*FoldToBaseAddr=*/true);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1298,7 +1310,8 @@
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+  bool IsExtendedRegisterWorthFolding =
+      isWorthFolding(N, /*FoldToBaseAddr=*/true);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
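For context, the reuse check that FoldToBaseAddr now bypasses rejects shapes like the following sketch (function name invented), where the shifted index also feeds a non-memory operation, so folding the shift into the load would not eliminate the shift instruction:

```llvm
; The shl feeding the address is also consumed by a plain add, so folding it
; into the ldr cannot delete it; isWorthFoldingSHL returns false here when
; FoldToBaseAddr is not set.
define i64 @shl_reused(ptr %p, i64 %i) {
  %off = shl i64 %i, 3
  %addr = getelementptr inbounds i8, ptr %p, i64 %off
  %v = load i64, ptr %addr
  %r = add i64 %v, %off        ; non-memory use of the shifted value
  ret i64 %r
}
```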
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast -mattr=+shift-2-slow | FileCheck %s --check-prefixes=CHECK,CHECK1
 
 %struct.a = type [256 x i16]
 %struct.b = type [256 x i32]
@@ -12,32 +12,32 @@
 ; CHECK0:       // %bb.0:
 ; CHECK0-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT:    ubfx x8, x1, #9, #8
+; CHECK0-NEXT:    ubfx x21, x1, #9, #8
 ; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsl x21, x8, #1
 ; CHECK0-NEXT:    mov x19, x0
-; CHECK0-NEXT:    ldrh w20, [x0, x21]
+; CHECK0-NEXT:    ldrh w20, [x0, x21, lsl #1]
 ; CHECK0-NEXT:    bl foo
 ; CHECK0-NEXT:    mov w0, w20
-; CHECK0-NEXT:    strh w20, [x19, x21]
+; CHECK0-NEXT:    strh w20, [x19, x21, lsl #1]
 ; CHECK0-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK0-NEXT:    ret
 ;
-; CHECK3-LABEL: halfword:
-; CHECK3:       // %bb.0:
-; CHECK3-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT:    ubfx x21, x1, #9, #8
-; CHECK3-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT:    mov x19, x0
-; CHECK3-NEXT:    ldrh w20, [x0, x21, lsl #1]
-; CHECK3-NEXT:    bl foo
-; CHECK3-NEXT:    mov w0, w20
-; CHECK3-NEXT:    strh w20, [x19, x21, lsl #1]
-; CHECK3-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT:    ret
+; CHECK1-LABEL: halfword:
+; CHECK1:       // %bb.0:
+; CHECK1-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK1-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK1-NEXT:    ubfx x8, x1, #9, #8
+; CHECK1-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK1-NEXT:    lsl x21, x8, #1
+; CHECK1-NEXT:    mov x19, x0
+; CHECK1-NEXT:    ldrh w20, [x0, x21]
+; CHECK1-NEXT:    bl foo
+; CHECK1-NEXT:    mov w0, w20
+; CHECK1-NEXT:    strh w20, [x19, x21]
+; CHECK1-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK1-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK1-NEXT:    ret
   %shr81 = lshr i32 %xor72, 9
   %conv82 = zext i32 %shr81 to i64
   %idxprom83 = and i64 %conv82, 255
@@ -49,36 +49,20 @@
 }
 
 define i32 @word(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: word:
-; CHECK0:       // %bb.0:
-; CHECK0-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT:    ubfx x8, x1, #9, #8
-; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsl x21, x8, #2
-; CHECK0-NEXT:    mov x19, x0
-; CHECK0-NEXT:    ldr w20, [x0, x21]
-; CHECK0-NEXT:    bl foo
-; CHECK0-NEXT:    mov w0, w20
-; CHECK0-NEXT:    str w20, [x19, x21]
-; CHECK0-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT:    ret
-;
-; CHECK3-LABEL: word:
-; CHECK3:       // %bb.0:
-; CHECK3-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT:    ubfx x21, x1, #9, #8
-; CHECK3-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT:    mov x19, x0
-; CHECK3-NEXT:    ldr w20, [x0, x21, lsl #2]
-; CHECK3-NEXT:    bl foo
-; CHECK3-NEXT:    mov w0, w20
-; CHECK3-NEXT:    str w20, [x19, x21, lsl #2]
-; CHECK3-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT:    ret
+; CHECK-LABEL: word:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ubfx x21, x1, #9, #8
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    ldr w20, [x0, x21, lsl #2]
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    mov w0, w20
+; CHECK-NEXT:    str w20, [x19, x21, lsl #2]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
   %shr81 = lshr i32 %xor72, 9
   %conv82 = zext i32 %shr81 to i64
   %idxprom83 = and i64 %conv82, 255
@@ -90,36 +74,20 @@
 }
 
 define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: doubleword:
-; CHECK0:       // %bb.0:
-; CHECK0-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT:    ubfx x8, x1, #9, #8
-; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsl x21, x8, #3
-; CHECK0-NEXT:    mov x19, x0
-; CHECK0-NEXT:    ldr x20, [x0, x21]
-; CHECK0-NEXT:    bl foo
-; CHECK0-NEXT:    mov x0, x20
-; CHECK0-NEXT:    str x20, [x19, x21]
-; CHECK0-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT:    ret
-;
-; CHECK3-LABEL: doubleword:
-; CHECK3:       // %bb.0:
-; CHECK3-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT:    ubfx x21, x1, #9, #8
-; CHECK3-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT:    mov x19, x0
-; CHECK3-NEXT:    ldr x20, [x0, x21, lsl #3]
-; CHECK3-NEXT:    bl foo
-; CHECK3-NEXT:    mov x0, x20
-; CHECK3-NEXT:    str x20, [x19, x21, lsl #3]
-; CHECK3-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT:    ret
+; CHECK-LABEL: doubleword:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ubfx x21, x1, #9, #8
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    ldr x20, [x0, x21, lsl #3]
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    str x20, [x19, x21, lsl #3]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
   %shr81 = lshr i32 %xor72, 9
   %conv82 = zext i32 %shr81 to i64
   %idxprom83 = and i64 %conv82, 255
@@ -163,20 +131,12 @@
 }
 
 define i64 @gep3(ptr %p, i64 %b) {
-; CHECK0-LABEL: gep3:
-; CHECK0:       // %bb.0:
-; CHECK0-NEXT:    lsl x9, x1, #3
-; CHECK0-NEXT:    mov x8, x0
-; CHECK0-NEXT:    ldr x0, [x0, x9]
-; CHECK0-NEXT:    str x1, [x8, x9]
-; CHECK0-NEXT:    ret
-;
-; CHECK3-LABEL: gep3:
-; CHECK3:       // %bb.0:
-; CHECK3-NEXT:    mov x8, x0
-; CHECK3-NEXT:    ldr x0, [x0, x1, lsl #3]
-; CHECK3-NEXT:    str x1, [x8, x1, lsl #3]
-; CHECK3-NEXT:    ret
+; CHECK-LABEL: gep3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldr x0, [x0, x1, lsl #3]
+; CHECK-NEXT:    str x1, [x8, x1, lsl #3]
+; CHECK-NEXT:    ret
   %g = getelementptr inbounds i64, ptr %p, i64 %b
   %l = load i64, ptr %g
   store i64 %b, ptr %g
diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
--- a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -125,7 +125,6 @@
 }
 
 ; CHECK: @test
-; CHECK-NOT: , uxtw #2]
 define i32 @test(ptr %array, i8 zeroext %c, i32 %arg) {
 entry:
   %conv = zext i8 %c to i32
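The extract-bits.ll churn below is almost entirely regenerated check lines: a newer update_llc_test_checks.py annotates mov immediates with their hex value (`// =0x1` and friends). The only codegen difference is the final hunk, where the scale-4 shift is now folded into both memory operations. A reduced sketch of that shape (illustrative, not the test's exact IR):

```llvm
; Reduced shape of the last extract-bits hunk: one scaled address with two
; memory uses. With this patch both RUN configurations are expected to fold
; the lsl #2 into the ldr and the str.
define void @bump(ptr %a, i64 %tmp) {
  %idx = lshr i64 %tmp, 21
  %masked = and i64 %idx, 1023          ; matches the ubfx #21, #10
  %g = getelementptr inbounds i32, ptr %a, i64 %masked
  %v = load i32, ptr %g
  %v1 = add i32 %v, 1
  store i32 %v1, ptr %g
  ret void
}
```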
diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll
--- a/llvm/test/CodeGen/AArch64/extract-bits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-bits.ll
@@ -21,7 +21,7 @@
 define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -37,7 +37,7 @@
 define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a0_arithmetic:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    asr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -53,7 +53,7 @@
 define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -72,7 +72,7 @@
 ; CHECK-LABEL: bextr32_a2_load:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
 ; CHECK-NEXT:    lsr w9, w9, w1
@@ -90,7 +90,7 @@
 ; CHECK-LABEL: bextr32_a3_load_indexzext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
 ; CHECK-NEXT:    lsr w9, w9, w1
@@ -109,7 +109,7 @@
 define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -127,7 +127,7 @@
 define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub x8, x8, #1
@@ -143,7 +143,7 @@
 define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a0_arithmetic:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    asr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub x8, x8, #1
@@ -159,7 +159,7 @@
 define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x9, x0, x1
@@ -180,7 +180,7 @@
 ; CHECK-LABEL: bextr64_a2_load:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    lsr x9, x9, x1
@@ -198,7 +198,7 @@
 ; CHECK-LABEL: bextr64_a3_load_indexzext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
@@ -219,7 +219,7 @@
 define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub x8, x8, #1
@@ -238,7 +238,7 @@
 define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_a0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -256,7 +256,7 @@
 define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_a1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -275,7 +275,7 @@
 define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_a2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -297,7 +297,7 @@
 define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -312,7 +312,7 @@
 define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -330,7 +330,7 @@
 ; CHECK-LABEL: bextr32_b2_load:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    lsr w9, w9, w1
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -347,7 +347,7 @@
 ; CHECK-LABEL: bextr32_b3_load_indexzext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    lsr w9, w9, w1
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -365,7 +365,7 @@
 define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -382,7 +382,7 @@
 define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    bic x0, x9, x8
@@ -397,7 +397,7 @@
 define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x9, x0, x1
@@ -417,7 +417,7 @@
 ; CHECK-LABEL: bextr64_b2_load:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    lsr x9, x9, x1
 ; CHECK-NEXT:    bic x0, x9, x8
@@ -434,7 +434,7 @@
 ; CHECK-LABEL: bextr64_b3_load_indexzext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsl x8, x8, x2
@@ -454,7 +454,7 @@
 define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    bic x0, x9, x8
@@ -472,7 +472,7 @@
 define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_b0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
@@ -491,7 +491,7 @@
 define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_b1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
@@ -511,7 +511,7 @@
 define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_b2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
@@ -535,7 +535,7 @@
 ; CHECK-LABEL: bextr32_c0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w10, w0, w1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -550,8 +550,8 @@
 define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_c1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w8, #32 // =0x20
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    sub w8, w8, w2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr w10, w0, w1
@@ -572,7 +572,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #-1
+; CHECK-NEXT:    mov w10, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w9, w9, w1
 ; CHECK-NEXT:    lsr w8, w10, w8
 ; CHECK-NEXT:    and w0, w8, w9
@@ -588,10 +588,10 @@
 define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_c3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEXT:    ldr w9, [x0]
 ; CHECK-NEXT:    sub w8, w8, w2
-; CHECK-NEXT:    mov w10, #-1
+; CHECK-NEXT:    mov w10, #-1 // =0xffffffff
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr w9, w9, w1
 ; CHECK-NEXT:    lsr w8, w10, w8
@@ -611,7 +611,7 @@
 ; CHECK-LABEL: bextr32_c4_commutative:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w10, w0, w1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w10, w8
@@ -629,7 +629,7 @@
 ; CHECK-LABEL: bextr64_c0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    mov x9, #-1
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr x8, x9, x8
 ; CHECK-NEXT:    and x0, x8, x10
@@ -644,8 +644,8 @@
 define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_c1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
-; CHECK-NEXT:    mov x9, #-1
+; CHECK-NEXT:    mov w8, #64 // =0x40
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    sub w8, w8, w2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x10, x0, x1
@@ -666,7 +666,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg x8, x2
 ; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x10, #-1
+; CHECK-NEXT:    mov x10, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x9, x9, x1
 ; CHECK-NEXT:    lsr x8, x10, x8
 ; CHECK-NEXT:    and x0, x8, x9
@@ -682,10 +682,10 @@
 define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_c3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    mov w8, #64 // =0x40
 ; CHECK-NEXT:    ldr x9, [x0]
 ; CHECK-NEXT:    sub w8, w8, w2
-; CHECK-NEXT:    mov x10, #-1
+; CHECK-NEXT:    mov x10, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x9, x9, x1
 ; CHECK-NEXT:    lsr x8, x10, x8
@@ -705,7 +705,7 @@
 ; CHECK-LABEL: bextr64_c4_commutative:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    mov x9, #-1
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr x8, x9, x8
 ; CHECK-NEXT:    and x0, x10, x8
@@ -724,7 +724,7 @@
 ; CHECK-LABEL: bextr64_32_c0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    mov x9, #-1
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr x8, x9, x8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -742,7 +742,7 @@
 ; CHECK-LABEL: bextr64_32_c1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -761,7 +761,7 @@
 ; CHECK-LABEL: bextr64_32_c2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -797,7 +797,7 @@
 define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_d1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    sub w8, w8, w2
@@ -833,7 +833,7 @@
 define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_d3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEXT:    ldr w9, [x0]
 ; CHECK-NEXT:    sub w8, w8, w2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
@@ -871,7 +871,7 @@
 define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_d1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    mov w8, #64 // =0x40
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    sub w8, w8, w2
@@ -907,7 +907,7 @@
 define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_d3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    mov w8, #64 // =0x40
 ; CHECK-NEXT:    ldr x9, [x0]
 ; CHECK-NEXT:    sub w8, w8, w2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
@@ -972,10 +972,9 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x1]
 ; CHECK-NEXT:    ubfx x8, x8, #21, #10
-; CHECK-NEXT:    lsl x8, x8, #2
-; CHECK-NEXT:    ldr w9, [x0, x8]
+; CHECK-NEXT:    ldr w9, [x0, x8, lsl #2]
 ; CHECK-NEXT:    add w9, w9, #1
-; CHECK-NEXT:    str w9, [x0, x8]
+; CHECK-NEXT:    str w9, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ret
   %tmp = load i64, ptr %a1, align 8
   %tmp1 = lshr i64 %tmp, 21
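Since the tune-list changes in AArch64.td attach FeatureShiftBy2Slow to Cortex-A55 among others, the new behaviour should also kick in from -mcpu alone. A possible extra regression test; the RUN and CHECK lines are an untested sketch, not output from this patch:

```llvm
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cortex-a55 | FileCheck %s
; Expect the scale-2 shift to stay a separate instruction on this core.
; CHECK: lsl {{x[0-9]+}}, {{x[0-9]+}}, #1
define void @copy16(ptr %p, i64 %i) {
  %g = getelementptr inbounds i16, ptr %p, i64 %i
  %v = load i16, ptr %g
  store i16 %v, ptr %g
  ret void
}
```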