diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2803,6 +2803,122 @@
   return true;
 }
 
+static bool isWorthFoldingIntoOrrWithLeftShift(SDValue Dst,
+                                               SelectionDAG *CurDAG,
+                                               SDValue &LeftShiftedOperand,
+                                               uint64_t &LeftShiftAmount) {
+  // Avoid folding Dst into ORR-with-left-shift if Dst has other uses than ORR.
+  if (!Dst.hasOneUse())
+    return false;
+
+  EVT VT = Dst.getValueType();
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Caller should guarantee that VT is one of i32 or i64");
+  const unsigned SizeInBits = VT.getSizeInBits();
+
+  SDLoc DL(Dst.getNode());
+  uint64_t AndImm, ShlImm;
+  if (isOpcWithIntImmediate(Dst.getNode(), ISD::AND, AndImm) &&
+      isShiftedMask_64(AndImm)) {
+    // Avoid transforming 'DstOp0' if it has other uses than the AND node.
+    SDValue DstOp0 = Dst.getOperand(0);
+    if (!DstOp0.hasOneUse())
+      return false;
+
+    // An example to illustrate the transformation
+    // From:
+    //   lsr     x8, x1, #1
+    //   and     x8, x8, #0x3f80
+    //   bfxil   x8, x1, #0, #7
+    // To:
+    //   and     x8, x1, #0x7f
+    //   ubfx    x9, x1, #8, #7
+    //   orr     x8, x8, x9, lsl #7
+    //
+    // The number of instructions remains the same, but ORR is faster than BFXIL
+    // on many AArch64 processors (or as good as BFXIL if not faster). Besides,
+    // the dependency chain is improved after the transformation.
+    uint64_t SrlImm;
+    if (isOpcWithIntImmediate(DstOp0.getNode(), ISD::SRL, SrlImm)) {
+      uint64_t NumTrailingZeroInShiftedMask = countTrailingZeros(AndImm);
+      if ((SrlImm + NumTrailingZeroInShiftedMask) < SizeInBits) {
+        unsigned MaskWidth =
+            countTrailingOnes(AndImm >> NumTrailingZeroInShiftedMask);
+        unsigned UBFMOpc =
+            (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+        SDNode *UBFMNode = CurDAG->getMachineNode(
+            UBFMOpc, DL, VT, DstOp0.getOperand(0),
+            CurDAG->getTargetConstant(SrlImm + NumTrailingZeroInShiftedMask, DL,
+                                      VT),
+            CurDAG->getTargetConstant(
+                SrlImm + NumTrailingZeroInShiftedMask + MaskWidth - 1, DL, VT));
+        LeftShiftedOperand = SDValue(UBFMNode, 0);
+        LeftShiftAmount = NumTrailingZeroInShiftedMask;
+        return true;
+      }
+    }
+  } else if (isOpcWithIntImmediate(Dst.getNode(), ISD::SHL, ShlImm)) {
+    LeftShiftedOperand = Dst.getOperand(0);
+    LeftShiftAmount = ShlImm;
+    return true;
+  }
+  // FIXME: Extend the implementation to optimize if Dst is an SRL node.
+  return false;
+}
+
+static bool tryOrrWithLeftShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
+                                SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
+                                const bool BiggerPattern) {
+  EVT VT = N->getValueType(0);
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Expect result type to be i32 or i64 since N is combinable to BFM");
+  SDLoc DL(N);
+
+  // Bail out if BFM simplifies away one node in BFM Dst.
+  if (OrOpd1 != Dst)
+    return false;
+
+  // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
+  // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
+  if (BiggerPattern) {
+    uint64_t SrcAndImm;
+    if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::AND, SrcAndImm) &&
+        isMask_64(SrcAndImm) && OrOpd0.getOperand(0) == Src) {
+      // OrOpd0 = AND Src, #Mask
+      // So BFM simplifies away one AND node from Src and doesn't simplify away
+      // nodes from Dst. If ORR with left-shifted operand also simplifies away
+      // one node (from Rd), ORR is better since it has higher throughput and
+      // lower latency than BFM on many AArch64 processors (and for the rest
+      // ORR is at least as good as BFM).
+      SDValue LeftShiftedOperand;
+      uint64_t LeftShiftAmount;
+      if (isWorthFoldingIntoOrrWithLeftShift(Dst, CurDAG, LeftShiftedOperand,
+                                             LeftShiftAmount)) {
+        unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
+        SDValue Ops[] = {OrOpd0, LeftShiftedOperand,
+                         CurDAG->getTargetConstant(LeftShiftAmount, DL, VT)};
+        CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  assert((!BiggerPattern) && "BiggerPattern should be handled above");
+
+  uint64_t ShlImm;
+  // FIXME: Extend the implementation if OrOpd0 is an SRL node.
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
+      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
+    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
+    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ShlImm, DL, VT)};
+    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+    return true;
+  }
+
+  return false;
+}
+
 static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
                                       SelectionDAG *CurDAG) {
   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
@@ -2905,6 +3021,14 @@
       // or is useful because it discards more bits
       Dst = OrOpd1Val;
 
+    // Before selecting ISD::OR node to AArch64::BFM, see if an AArch64::ORR
+    // with left-shifted operand is more efficient.
+    // FIXME: Extend this to compare AArch64::BFM and AArch64::ORR with
+    // right-shifted operand as well.
+    if (tryOrrWithLeftShift(N, OrOpd0Val, OrOpd1Val, Src, Dst, CurDAG,
+                            BiggerPattern))
+      return true;
+
     // both parts match
     SDLoc DL(N);
     SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
diff --git a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
--- a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -964,9 +964,9 @@
 define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
 ; LLC-LABEL: test_ignored_rightbits:
 ; LLC:       // %bb.0:
-; LLC-NEXT:    and w0, w0, #0x7
-; LLC-NEXT:    bfi w0, w1, #3, #4
-; LLC-NEXT:    bfi w0, w0, #8, #7
+; LLC-NEXT:    and w8, w0, #0x7
+; LLC-NEXT:    bfi w8, w1, #3, #4
+; LLC-NEXT:    orr w0, w8, w8, lsl #8
 ; LLC-NEXT:    ret
 ; OPT-LABEL: @test_ignored_rightbits(
 ; OPT-NEXT:    [[POSITIONED_FIELD:%.*]] = shl i32 [[IN:%.*]], 3
@@ -1000,8 +1000,8 @@
 ; LLC-NEXT:    lsr x8, x0, #47
 ; LLC-NEXT:    and w9, w1, #0x3
 ; LLC-NEXT:    bfi w9, w8, #2, #2
-; LLC-NEXT:    bfi w9, w9, #4, #4
-; LLC-NEXT:    strh w9, [x2]
+; LLC-NEXT:    orr w8, w9, w9, lsl #4
+; LLC-NEXT:    strh w8, [x2]
 ; LLC-NEXT:  .LBB30_2: // %end
 ; LLC-NEXT:    ret
 ; OPT-LABEL: @sameOperandBFI(
diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
--- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -5,8 +5,8 @@
 ; CHECK-LABEL: ldi24:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #2]
-; CHECK-NEXT:    ldrh w0, [x0]
-; CHECK-NEXT:    bfi w0, w8, #16, #16
+; CHECK-NEXT:    ldrh w9, [x0]
+; CHECK-NEXT:    orr w0, w9, w8, lsl #16
 ; CHECK-NEXT:    ret
   %r = load i24, i24* %p
   ret i24 %r
@@ -17,9 +17,9 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #6]
 ; CHECK-NEXT:    ldrh w9, [x0, #4]
-; CHECK-NEXT:    ldr w0, [x0]
-; CHECK-NEXT:    bfi w9, w8, #16, #16
-; CHECK-NEXT:    bfi x0, x9, #32, #32
+; CHECK-NEXT:    ldr w10, [x0]
+; CHECK-NEXT:    orr w8, w9, w8, lsl #16
+; CHECK-NEXT:    orr x0, x10, x8, lsl #32
 ; CHECK-NEXT:    ret
   %r = load i56, i56* %p
   ret i56 %r
@@ -41,10 +41,10 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #14]
 ; CHECK-NEXT:    ldrh w9, [x0, #12]
-; CHECK-NEXT:    ldr w1, [x0, #8]
+; CHECK-NEXT:    ldr w10, [x0, #8]
 ; CHECK-NEXT:    ldr x0, [x0]
-; CHECK-NEXT:    bfi w9, w8, #16, #16
-; CHECK-NEXT:    bfi x1, x9, #32, #32
+; CHECK-NEXT:    orr w8, w9, w8, lsl #16
+; CHECK-NEXT:    orr x1, x10, x8, lsl #32
 ; CHECK-NEXT:    ret
   %r = load i120, i120* %p
   ret i120 %r
@@ -55,10 +55,10 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x1, [x0]
 ; CHECK-NEXT:    ldrb w9, [x0, #34]
-; CHECK-NEXT:    ldrh w4, [x0, #32]
+; CHECK-NEXT:    ldrh w10, [x0, #32]
 ; CHECK-NEXT:    ldp x2, x3, [x0, #16]
 ; CHECK-NEXT:    mov x0, x8
-; CHECK-NEXT:    bfi x4, x9, #16, #8
+; CHECK-NEXT:    orr x4, x10, x9, lsl #16
 ; CHECK-NEXT:    ret
   %r = load i280, i280* %p
   ret i280 %r
@@ -133,7 +133,7 @@
 ; CHECK-NEXT:    ldrh w10, [x8, #4]!
 ; CHECK-NEXT:    ldrb w11, [x8, #2]
 ; CHECK-NEXT:    orr w9, w9, #0x180
-; CHECK-NEXT:    bfi w10, w11, #16, #16
+; CHECK-NEXT:    orr w10, w10, w11, lsl #16
 ; CHECK-NEXT:    str w9, [x0]
 ; CHECK-NEXT:    strb w11, [x8, #2]
 ; CHECK-NEXT:    strh w10, [x8]
@@ -153,7 +153,7 @@
 ; CHECK-NEXT:    ldrb w11, [x8, #2]
 ; CHECK-NEXT:    orr w9, w9, #0x180
 ; CHECK-NEXT:    and w9, w9, #0xffffff80
-; CHECK-NEXT:    bfi w10, w11, #16, #16
+; CHECK-NEXT:    orr w10, w10, w11, lsl #16
 ; CHECK-NEXT:    strb w11, [x8, #2]
 ; CHECK-NEXT:    str w9, [x0]
 ; CHECK-NEXT:    strh w10, [x8]
@@ -172,11 +172,11 @@
 ; CHECK-NEXT:    ldr w11, [x0]
 ; CHECK-NEXT:    ldrh w9, [x8, #4]!
 ; CHECK-NEXT:    ldrb w10, [x8, #2]
-; CHECK-NEXT:    bfi w9, w10, #16, #8
+; CHECK-NEXT:    orr w9, w9, w10, lsl #16
 ; CHECK-NEXT:    strb w10, [x8, #2]
-; CHECK-NEXT:    bfi x11, x9, #32, #24
-; CHECK-NEXT:    strh w9, [x8]
+; CHECK-NEXT:    orr x11, x11, x9, lsl #32
 ; CHECK-NEXT:    and x11, x11, #0xffffffffffffdfff
+; CHECK-NEXT:    strh w9, [x8]
 ; CHECK-NEXT:    orr w11, w11, w1, lsl #13
 ; CHECK-NEXT:    str w11, [x0]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-strict-align.ll b/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
--- a/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -5,7 +5,7 @@
 define i32 @f0(i32* nocapture %p) nounwind {
 ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
 ; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
-; CHECK-STRICT: bfi [[LOW]], [[HIGH]], #16, #16
+; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
 ; CHECK-STRICT: ret
 
 ; CHECK: ldr w0, [x0]
@@ -16,7 +16,7 @@
 
 define i64 @f1(i64* nocapture %p) nounwind {
 ; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
-; CHECK-STRICT: bfi x[[LOW]], x[[HIGH]], #32, #32
+; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
 ; CHECK-STRICT: ret
 
 ; CHECK: ldr x0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/arm64_32.ll b/llvm/test/CodeGen/AArch64/arm64_32.ll
--- a/llvm/test/CodeGen/AArch64/arm64_32.ll
+++ b/llvm/test/CodeGen/AArch64/arm64_32.ll
@@ -662,8 +662,9 @@
 ; CHECK-LABEL: test_struct_hi:
 ; CHECK: mov w[[IN:[0-9]+]], w0
 ; CHECK: bl _get_int
-; CHECK-FAST-NEXT: mov w0, w0
-; CHECK-NEXT: bfi x0, x[[IN]], #32, #32
+; CHECK-FAST-NEXT: mov w[[DST:[0-9]+]], w0
+; CHECK-FAST-NEXT: orr x0, x[[DST]], x[[IN]], lsl #32
+; CHECK-OPT-NEXT: bfi x0, x[[IN]], #32, #32
 ; CHECK-NEXT: bl _take_pair
   %val.64 = call i64 @get_int()
   %val.32 = trunc i64 %val.64 to i32
diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
--- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
+++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -28,8 +28,8 @@
 ; CHECK-NEXT:    ldr x11, [x9, #8]
 ; CHECK-NEXT:    and x9, x10, #0xff
 ; CHECK-NEXT:    and x10, x0, #0xffffffff00000000
-; CHECK-NEXT:    bfi x9, x8, #8, #32
-; CHECK-NEXT:    bfi x10, x12, #16, #1
+; CHECK-NEXT:    orr x9, x9, x8, lsl #8
+; CHECK-NEXT:    orr x10, x10, x12, lsl #16
 ; CHECK-NEXT:    orr x0, x10, x9
 ; CHECK-NEXT:    ldr x9, [x11, #16]
 ; CHECK-NEXT:    cbnz x11, .LBB0_1
@@ -97,8 +97,8 @@
 ; CHECK-NEXT:    ldr x11, [x9, #8]
 ; CHECK-NEXT:    and x9, x10, #0xff
 ; CHECK-NEXT:    and x10, x0, #0xffffffff00000000
-; CHECK-NEXT:    bfi x9, x8, #8, #32
-; CHECK-NEXT:    bfi x10, x12, #16, #1
+; CHECK-NEXT:    orr x9, x9, x8, lsl #8
+; CHECK-NEXT:    orr x10, x10, x12, lsl #16
 ; CHECK-NEXT:    orr x0, x10, x9
 ; CHECK-NEXT:    ldr x9, [x11, #16]
 ; CHECK-NEXT:    cbnz x11, .LBB1_1
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -269,8 +269,7 @@
 ; CHECK-NEXT:    lsl w8, w8, #8
 ; CHECK-NEXT:    mov w9, w8
 ; CHECK-NEXT:    bfxil w9, w0, #0, #8
-; CHECK-NEXT:    bfi w8, w9, #16, #16
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w9, lsl #16
 ; CHECK-NEXT:    ret
   %conv = zext i8 %a to i32     ; 0 0 0 A
   %shl = shl i32 %b, 8          ; B2 B1 B0 0
@@ -612,10 +611,9 @@
 define i64 @test_orr_not_bfxil_i64(i64 %0) {
 ; CHECK-LABEL: test_orr_not_bfxil_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #1
-; CHECK-NEXT:    and x8, x8, #0x3f80
-; CHECK-NEXT:    bfxil x8, x0, #0, #7
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ubfx x8, x0, #8, #7
+; CHECK-NEXT:    and x9, x0, #0x7f
+; CHECK-NEXT:    orr x0, x9, x8, lsl #7
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 127
   %3 = lshr i64 %0, 1
@@ -628,10 +626,9 @@
 define i32 @test_orr_not_bfxil_i32(i32 %0) {
 ; CHECK-LABEL: test_orr_not_bfxil_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #1
-; CHECK-NEXT:    and w8, w8, #0x3f80
-; CHECK-NEXT:    bfxil w8, w0, #0, #7
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ubfx w8, w0, #8, #7
+; CHECK-NEXT:    and w9, w0, #0x7f
+; CHECK-NEXT:    orr w0, w9, w8, lsl #7
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 127
   %3 = lshr i32 %0, 1
diff --git a/llvm/test/CodeGen/AArch64/build-pair-isel.ll b/llvm/test/CodeGen/AArch64/build-pair-isel.ll
--- a/llvm/test/CodeGen/AArch64/build-pair-isel.ll
+++ b/llvm/test/CodeGen/AArch64/build-pair-isel.ll
@@ -14,7 +14,7 @@
 ; CHECK-NEXT:    mov w9, w10
 ; CHECK-NEXT:    mov w8, w8
 ; CHECK-NEXT:    // kill: def $x8 killed $w8
-; CHECK-NEXT:    bfi x8, x9, #32, #32
+; CHECK-NEXT:    orr x8, x8, x9, lsl #32
 ; CHECK-NEXT:    // implicit-def: $x9
 ; CHECK-NEXT:    str x8, [x9]
 ; CHECK-NEXT:    ret
@@ -22,5 +22,3 @@
   store i128 %1, i128* undef, align 16
   ret void
 }
-
-
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
--- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -19,8 +19,7 @@
 ; CHECK-LABEL: rotl_i8_const_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ubfx w8, w0, #5, #3
-; CHECK-NEXT:    bfi w8, w0, #3, #29
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w0, lsl #3
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
   ret i8 %f
diff --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
--- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
@@ -463,8 +463,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
 ; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    lsl w0, w8, #8
-; CHECK-NEXT:    bfi w0, w9, #16, #8
+; CHECK-NEXT:    lsl w8, w8, #8
+; CHECK-NEXT:    orr w0, w8, w9, lsl #16
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
@@ -486,8 +486,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
 ; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    lsl w0, w8, #16
-; CHECK-NEXT:    bfi w0, w9, #24, #8
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    orr w0, w8, w9, lsl #24
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
@@ -527,8 +527,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #1]
 ; CHECK-NEXT:    ldrb w9, [x0]
-; CHECK-NEXT:    lsl w0, w8, #8
-; CHECK-NEXT:    bfi w0, w9, #16, #8
+; CHECK-NEXT:    lsl w8, w8, #8
+; CHECK-NEXT:    orr w0, w8, w9, lsl #16
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
@@ -550,8 +550,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #1]
 ; CHECK-NEXT:    ldrb w9, [x0]
-; CHECK-NEXT:    lsl w0, w8, #16
-; CHECK-NEXT:    bfi w0, w9, #24, #8
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    orr w0, w8, w9, lsl #24
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
@@ -576,8 +576,8 @@
 ; CHECK-LABEL: load_i16_from_nonzero_offset:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrb w0, [x0, #2]
-; CHECK-NEXT:    bfi w0, w8, #8, #24
+; CHECK-NEXT:    ldrb w9, [x0, #2]
+; CHECK-NEXT:    orr w0, w9, w8, lsl #8
 ; CHECK-NEXT:    ret
   %p1.i16 = bitcast i8* %p to i16*
   %p2.i8 = getelementptr i8, i8* %p, i64 2
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -453,8 +453,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
 ; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    lsl w0, w8, #8
-; CHECK-NEXT:    bfi w0, w9, #16, #8
+; CHECK-NEXT:    lsl w8, w8, #8
+; CHECK-NEXT:    orr w0, w8, w9, lsl #16
 ; CHECK-NEXT:    ret
 
   %tmp = bitcast i32* %arg to i8*
@@ -477,8 +477,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
 ; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    lsl w0, w8, #16
-; CHECK-NEXT:    bfi w0, w9, #24, #8
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    orr w0, w8, w9, lsl #24
 ; CHECK-NEXT:    ret
 
   %tmp = bitcast i32* %arg to i8*
@@ -521,8 +521,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #1]
 ; CHECK-NEXT:    ldrb w9, [x0]
-; CHECK-NEXT:    lsl w0, w8, #8
-; CHECK-NEXT:    bfi w0, w9, #16, #8
+; CHECK-NEXT:    lsl w8, w8, #8
+; CHECK-NEXT:    orr w0, w8, w9, lsl #16
 ; CHECK-NEXT:    ret
 
   %tmp = bitcast i32* %arg to i8*
@@ -545,8 +545,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #1]
 ; CHECK-NEXT:    ldrb w9, [x0]
-; CHECK-NEXT:    lsl w0, w8, #16
-; CHECK-NEXT:    bfi w0, w9, #24, #8
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    orr w0, w8, w9, lsl #24
 ; CHECK-NEXT:    ret
 
   %tmp = bitcast i32* %arg to i8*
@@ -603,7 +603,7 @@
 ; CHECK-NEXT:    umov w10, v0.h[3]
 ; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    bfi w8, w9, #8, #8
-; CHECK-NEXT:    bfi w8, w10, #24, #8
+; CHECK-NEXT:    orr w8, w8, w10, lsl #24
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
   %ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -634,8 +634,8 @@
 ; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    bfi w9, w8, #16, #8
-; CHECK-NEXT:    str w9, [x1]
+; CHECK-NEXT:    orr w8, w9, w8, lsl #16
+; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
 
   %ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -665,7 +665,7 @@
 ; CHECK-NEXT:    umov w8, v0.h[3]
 ; CHECK-NEXT:    umov w9, v0.h[2]
 ; CHECK-NEXT:    lsl w8, w8, #24
-; CHECK-NEXT:    bfi w8, w9, #16, #8
+; CHECK-NEXT:    orr w8, w8, w9, lsl #16
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
   %ld = load <4 x i8>, <4 x i8>* %in, align 4
diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll b/llvm/test/CodeGen/AArch64/logic-shift.ll
--- a/llvm/test/CodeGen/AArch64/logic-shift.ll
+++ b/llvm/test/CodeGen/AArch64/logic-shift.ll
@@ -818,8 +818,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    orr w8, w0, w1
 ; CHECK-NEXT:    lsr w8, w8, #26
-; CHECK-NEXT:    bfi w8, w0, #7, #25
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w0, lsl #7
 ; CHECK-NEXT:    ret
   %or1 = or i32 %x, %y
   %sh1 = shl i32 %x, 7
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -490,27 +490,27 @@
 ;
 ; CHECK-BE-LABEL: test_ldnp_v4i65:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    ldp x9, x8, [x0, #16]
-; CHECK-BE-NEXT:    ldp x11, x10, [x0]
-; CHECK-BE-NEXT:    ldrb w7, [x0, #32]
-; CHECK-BE-NEXT:    lsr x13, x9, #56
-; CHECK-BE-NEXT:    lsr x14, x11, #56
-; CHECK-BE-NEXT:    extr x15, x10, x9, #56
-; CHECK-BE-NEXT:    bfi x7, x8, #8, #56
-; CHECK-BE-NEXT:    extr x8, x9, x8, #56
-; CHECK-BE-NEXT:    extr x12, x11, x10, #56
-; CHECK-BE-NEXT:    lsr x11, x11, #59
-; CHECK-BE-NEXT:    ubfx x9, x9, #57, #1
+; CHECK-BE-NEXT:    ldp x10, x9, [x0, #16]
+; CHECK-BE-NEXT:    ldp x12, x11, [x0]
+; CHECK-BE-NEXT:    ldrb w8, [x0, #32]
+; CHECK-BE-NEXT:    lsr x13, x10, #56
+; CHECK-BE-NEXT:    lsr x14, x12, #56
+; CHECK-BE-NEXT:    extr x15, x11, x10, #56
+; CHECK-BE-NEXT:    orr x7, x8, x9, lsl #8
+; CHECK-BE-NEXT:    extr x8, x10, x9, #56
+; CHECK-BE-NEXT:    extr x9, x12, x11, #56
+; CHECK-BE-NEXT:    lsr x12, x12, #59
+; CHECK-BE-NEXT:    ubfx x10, x10, #57, #1
 ; CHECK-BE-NEXT:    extr x5, x13, x8, #1
-; CHECK-BE-NEXT:    extr x1, x14, x12, #3
-; CHECK-BE-NEXT:    ubfx x12, x10, #58, #1
-; CHECK-BE-NEXT:    fmov d0, x11
-; CHECK-BE-NEXT:    and x11, x8, #0x1
-; CHECK-BE-NEXT:    lsr x10, x10, #56
-; CHECK-BE-NEXT:    fmov d2, x9
-; CHECK-BE-NEXT:    fmov d1, x12
-; CHECK-BE-NEXT:    extr x3, x10, x15, #2
-; CHECK-BE-NEXT:    fmov d3, x11
+; CHECK-BE-NEXT:    extr x1, x14, x9, #3
+; CHECK-BE-NEXT:    ubfx x9, x11, #58, #1
+; CHECK-BE-NEXT:    fmov d0, x12
+; CHECK-BE-NEXT:    and x12, x8, #0x1
+; CHECK-BE-NEXT:    lsr x11, x11, #56
+; CHECK-BE-NEXT:    fmov d2, x10
+; CHECK-BE-NEXT:    fmov d1, x9
+; CHECK-BE-NEXT:    extr x3, x11, x15, #2
+; CHECK-BE-NEXT:    fmov d3, x12
 ; CHECK-BE-NEXT:    mov v0.d[1], x1
 ; CHECK-BE-NEXT:    mov v2.d[1], x5
 ; CHECK-BE-NEXT:    mov v1.d[1], x3
diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll
--- a/llvm/test/CodeGen/AArch64/rotate-extract.ll
+++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll
@@ -113,8 +113,8 @@
 ; CHECK-LABEL: no_extract_mul:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, x0, lsl #3
-; CHECK-NEXT:    lsr x0, x8, #57
-; CHECK-NEXT:    bfi x0, x8, #8, #56
+; CHECK-NEXT:    lsr x9, x8, #57
+; CHECK-NEXT:    orr x0, x9, x8, lsl #8
 ; CHECK-NEXT:    ret
   %lhs_mul = mul i64 %i, 2304
   %rhs_mul = mul i64 %i, 9
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -391,20 +391,20 @@
 ; CHECK-NEXT:    fmov s0, w10
 ; CHECK-NEXT:    ubfx x12, x9, #12, #20
 ; CHECK-NEXT:    lsr x15, x9, #31
-; CHECK-NEXT:    bfi w13, w11, #16, #8
-; CHECK-NEXT:    lsr x11, x9, #50
+; CHECK-NEXT:    orr w11, w13, w11, lsl #16
+; CHECK-NEXT:    lsr x13, x9, #50
 ; CHECK-NEXT:    mov.s v0[1], w14
 ; CHECK-NEXT:    fmov s1, w12
 ; CHECK-NEXT:    lsr x12, x10, #38
-; CHECK-NEXT:    bfi w11, w13, #14, #18
+; CHECK-NEXT:    orr w13, w13, w11, lsl #14
 ; CHECK-NEXT:    lsr x10, x10, #57
-; CHECK-NEXT:    bfi w10, w9, #7, #25
-; CHECK-NEXT:    lsr w9, w13, #5
+; CHECK-NEXT:    orr w9, w10, w9, lsl #7
+; CHECK-NEXT:    lsr w10, w11, #5
 ; CHECK-NEXT:    mov.s v1[1], w15
 ; CHECK-NEXT:    mov.s v0[2], w12
-; CHECK-NEXT:    mov.s v1[2], w11
-; CHECK-NEXT:    mov.s v0[3], w10
-; CHECK-NEXT:    mov.s v1[3], w9
+; CHECK-NEXT:    mov.s v1[2], w13
+; CHECK-NEXT:    mov.s v0[3], w9
+; CHECK-NEXT:    mov.s v1[3], w10
 ; CHECK-NEXT:    uzp1.8h v0, v0, v1
 ; CHECK-NEXT:    xtn.8b v0, v0
 ; CHECK-NEXT:    str d0, [x1, x8, lsl #3]
@@ -420,21 +420,21 @@
 ; CHECK-BE-NEXT:  .LBB5_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ldp x10, x9, [x0]
-; CHECK-BE-NEXT:    ldrh w15, [x0, #16]
-; CHECK-BE-NEXT:    lsr x12, x10, #40
+; CHECK-BE-NEXT:    ldrh w11, [x0, #16]
 ; CHECK-BE-NEXT:    lsr x13, x10, #45
-; CHECK-BE-NEXT:    lsr x11, x9, #40
+; CHECK-BE-NEXT:    lsr x15, x10, #40
+; CHECK-BE-NEXT:    lsr x12, x9, #40
 ; CHECK-BE-NEXT:    ubfx x14, x9, #33, #7
 ; CHECK-BE-NEXT:    ubfx x16, x10, #26, #14
-; CHECK-BE-NEXT:    bfi w16, w12, #14, #18
-; CHECK-BE-NEXT:    ubfx x12, x9, #14, #18
-; CHECK-BE-NEXT:    bfi w14, w11, #7, #24
-; CHECK-BE-NEXT:    ldrb w11, [x0, #18]
+; CHECK-BE-NEXT:    orr w12, w14, w12, lsl #7
+; CHECK-BE-NEXT:    ldrb w14, [x0, #18]
+; CHECK-BE-NEXT:    orr w15, w16, w15, lsl #14
 ; CHECK-BE-NEXT:    fmov s0, w13
 ; CHECK-BE-NEXT:    add x0, x0, #32
-; CHECK-BE-NEXT:    fmov s1, w14
-; CHECK-BE-NEXT:    bfi w11, w15, #8, #16
-; CHECK-BE-NEXT:    mov v0.s[1], w16
+; CHECK-BE-NEXT:    fmov s1, w12
+; CHECK-BE-NEXT:    ubfx x12, x9, #14, #18
+; CHECK-BE-NEXT:    orr w11, w14, w11, lsl #8
+; CHECK-BE-NEXT:    mov v0.s[1], w15
 ; CHECK-BE-NEXT:    mov v1.s[1], w12
 ; CHECK-BE-NEXT:    extr x12, x10, x9, #40
 ; CHECK-BE-NEXT:    lsl x9, x9, #24
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll
@@ -82,8 +82,8 @@
 ; CHECK-NEXT:    mul w8, w0, w8
 ; CHECK-NEXT:    and w9, w8, #0xfffc
 ; CHECK-NEXT:    lsr w9, w9, #1
-; CHECK-NEXT:    bfi w9, w8, #15, #17
-; CHECK-NEXT:    ubfx w8, w9, #1, #15
+; CHECK-NEXT:    orr w8, w9, w8, lsl #15
+; CHECK-NEXT:    ubfx w8, w8, #1, #15
 ; CHECK-NEXT:    cmp w8, #2340
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -249,17 +249,18 @@
 ; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    umov w9, v0.h[0]
+; CHECK-NEXT:    umov w8, v0.h[0]
+; CHECK-NEXT:    umov w9, v0.h[1]
 ; CHECK-NEXT:    umov w10, v0.h[2]
 ; CHECK-NEXT:    umov w11, v0.h[3]
 ; CHECK-NEXT:    and v1.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    bfi w9, w8, #1, #1
-; CHECK-NEXT:    bfi w9, w10, #2, #1
+; CHECK-NEXT:    and w8, w8, #0x1
+; CHECK-NEXT:    bfi w8, w9, #1, #1
 ; CHECK-NEXT:    mvn v0.8b, v0.8b
-; CHECK-NEXT:    bfi w9, w11, #3, #29
-; CHECK-NEXT:    and w8, w9, #0xf
+; CHECK-NEXT:    bfi w8, w10, #2, #1
+; CHECK-NEXT:    orr w8, w8, w11, lsl #3
+; CHECK-NEXT:    and w8, w8, #0xf
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -299,14 +299,15 @@
 ; CHECK-NEXT:    fmov d2, d0
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    umov w9, v1.h[0]
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    umov w9, v1.h[1]
 ; CHECK-NEXT:    umov w10, v1.h[2]
 ; CHECK-NEXT:    umov w11, v1.h[3]
-; CHECK-NEXT:    bfi w9, w8, #1, #1
-; CHECK-NEXT:    bfi w9, w10, #2, #1
-; CHECK-NEXT:    bfi w9, w11, #3, #29
-; CHECK-NEXT:    and w8, w9, #0xf
+; CHECK-NEXT:    and w8, w8, #0x1
+; CHECK-NEXT:    bfi w8, w9, #1, #1
+; CHECK-NEXT:    bfi w8, w10, #2, #1
+; CHECK-NEXT:    orr w8, w8, w11, lsl #3
+; CHECK-NEXT:    and w8, w8, #0xf
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
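As a quick sanity check on the bit-level equivalence behind the test_orr_not_bfxil_* updates, the two instruction sequences can be written out in plain C++ and compared. This sketch is illustrative only and not part of the patch; the helper names are made up.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Old codegen: lsr x8, x0, #1 ; and x8, x8, #0x3f80 ; bfxil x8, x0, #0, #7
static uint64_t bfxilForm(uint64_t X) {
  return ((X >> 1) & 0x3f80) | (X & 0x7f);
}

// New codegen: ubfx x8, x0, #8, #7 ; and x9, x0, #0x7f ; orr x0, x9, x8, lsl #7
static uint64_t orrForm(uint64_t X) {
  return (X & 0x7f) | (((X >> 8) & 0x7f) << 7);
}

int main() {
  for (uint64_t X : {0ULL, 0x7fULL, 0x3f80ULL, 0xdeadbeefcafef00dULL, ~0ULL})
    assert(bfxilForm(X) == orrForm(X)); // both keep bits [6:0] and move bits [14:8] to [13:7]
  return 0;
}

Both forms place source bits [14:8] at destination bits [13:7] and keep bits [6:0] in place, which is why the AND+UBFX+ORR sequence can replace the LSR+AND+BFXIL sequence without changing the result.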