diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -165,9 +165,6 @@
   // Floating point comparison
   FCMP,
 
-  // Scalar extract
-  EXTR,
-
   // Scalar-to-vector duplication
   DUP,
   DUPLANE8,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -556,6 +556,12 @@
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 
+  // Lower funnel shifts to EXTR
+  setOperationAction(ISD::FSHR, MVT::i32, Custom);
+  setOperationAction(ISD::FSHR, MVT::i64, Custom);
+  setOperationAction(ISD::FSHL, MVT::i32, Custom);
+  setOperationAction(ISD::FSHL, MVT::i64, Custom);
+
   if (Subtarget->isTargetWindows())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
   else
@@ -2351,7 +2357,6 @@
     MAKE_CASE(AArch64ISD::BICi)
     MAKE_CASE(AArch64ISD::ORRi)
     MAKE_CASE(AArch64ISD::BSP)
-    MAKE_CASE(AArch64ISD::EXTR)
     MAKE_CASE(AArch64ISD::ZIP1)
     MAKE_CASE(AArch64ISD::ZIP2)
     MAKE_CASE(AArch64ISD::UZP1)
@@ -5779,6 +5784,30 @@
   return SDValue();
 }
 
+// Lower funnel shifts with a constant shift amount: FSHL is rewritten as the
+// equivalent FSHR so the EXTR selection pattern can match. Variable shift
+// amounts are left to the generic expansion.
+static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
+  SDValue Shifts = Op.getOperand(2);
+  // Only fold when the shift amount is a constant.
+  if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
+    SDLoc dl(Op);
+    MVT VT = Op.getSimpleValueType();
+
+    if (Op.getOpcode() == ISD::FSHL) {
+      // fshl(a, b, n) == fshr(a, b, BitWidth - n) for 0 < n < BitWidth.
+      unsigned NewShiftNo = VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
+      return DAG.getNode(
+          ISD::FSHR, dl, VT, Op.getOperand(0), Op.getOperand(1),
+          DAG.getConstant(NewShiftNo, dl, Shifts.getValueType()));
+    }
+    if (Op.getOpcode() == ISD::FSHR)
+      return Op;
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -6088,6 +6117,9 @@
     return Result;
   }
+  case ISD::FSHL:
+  case ISD::FSHR:
+    return LowerFunnelShift(Op, DAG);
   }
 }
@@ -16679,50 +16711,6 @@
   return true;
 }
 
-/// EXTR instruction extracts a contiguous chunk of bits from two existing
-/// registers viewed as a high/low pair. This function looks for the pattern:
-/// (or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N)) and replaces it
-/// with an EXTR. Can't quite be done in TableGen because the two immediates
-/// aren't independent.
-static SDValue tryCombineToEXTR(SDNode *N,
-                                TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  assert(N->getOpcode() == ISD::OR && "Unexpected root");
-
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  SDValue LHS;
-  uint32_t ShiftLHS = 0;
-  bool LHSFromHi = false;
-  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
-    return SDValue();
-
-  SDValue RHS;
-  uint32_t ShiftRHS = 0;
-  bool RHSFromHi = false;
-  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
-    return SDValue();
-
-  // If they're both trying to come from the high part of the register, they're
-  // not really an EXTR.
-  if (LHSFromHi == RHSFromHi)
-    return SDValue();
-
-  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
-    return SDValue();
-
-  if (LHSFromHi) {
-    std::swap(LHS, RHS);
-    std::swap(ShiftLHS, ShiftRHS);
-  }
-
-  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
-                     DAG.getConstant(ShiftRHS, DL, MVT::i64));
-}
 
 static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64TargetLowering &TLI) {
@@ -16909,8 +16897,4 @@
   return SDValue();
 
-  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
-  if (SDValue Res = tryCombineToEXTR(N, DCI))
-    return Res;
-
   if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
     return Res;
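For reference, the rewrite LowerFunnelShift performs is the standard funnel-shift identity: for a constant amount n with 0 < n < BitWidth, fshl(a, b, n) equals fshr(a, b, BitWidth - n). A minimal IR sketch (illustrative only, not part of the patch; the function name is made up):

define i32 @fshl_as_fshr(i32 %a, i32 %b) {
  ; fshl(a, b, 5) == (a << 5) | (b >> 27)
  %l = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 5)
  ; fshr(a, b, 27) == (a << 5) | (b >> 27), the same value
  %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 27)
  ; xor of two equal values: always 0
  %eq = xor i32 %l, %r
  ret i32 %eq
}
declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i32 @llvm.fshr.i32(i32, i32, i32)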
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -2950,7 +2950,7 @@ multiclass ExtractImm<string asm> {
   def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
                       [(set GPR32:$Rd,
-                        (AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
+                        (fshr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
     let Inst{31} = 0;
     let Inst{22} = 0;
     // imm<5> must be zero.
    let Inst{20-16} = 0b00000;
@@ -2958,7 +2958,7 @@ multiclass ExtractImm<string asm> {
   }
   def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
                       [(set GPR64:$Rd,
-                        (AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
+                        (fshr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
 
     let Inst{31} = 1;
     let Inst{22} = 1;
diff --git a/llvm/test/CodeGen/AArch64/arm64-long-shift.ll b/llvm/test/CodeGen/AArch64/arm64-long-shift.ll
--- a/llvm/test/CodeGen/AArch64/arm64-long-shift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-long-shift.ll
@@ -60,7 +60,7 @@
 ; CHECK-NEXT:    and x10, x2, #0x3f
 ; CHECK-NEXT:    eor x10, x10, #0x3f
 ; CHECK-NEXT:    lsl x9, x9, x10
-; CHECK-NEXT:    orr x0, x8, x9
+; CHECK-NEXT:    orr x0, x9, x8
 ; CHECK-NEXT:    asr x1, x1, x2
 ; CHECK-NEXT:    ret
   %mask = and i128 %s, 63
@@ -93,7 +93,7 @@
 ; CHECK-NEXT:    and x10, x2, #0x3f
 ; CHECK-NEXT:    eor x10, x10, #0x3f
 ; CHECK-NEXT:    lsl x9, x9, x10
-; CHECK-NEXT:    orr x0, x8, x9
+; CHECK-NEXT:    orr x0, x9, x8
 ; CHECK-NEXT:    lsr x1, x1, x2
 ; CHECK-NEXT:    ret
   %mask = and i128 %s, 63
diff --git a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll
--- a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll
@@ -137,11 +137,8 @@
 ; CHECK-LABEL: opt_setcc_shl_ne_zero_i256:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    orr x8, x2, x0
-; CHECK-NEXT:    extr x9, x3, x2, #47
-; CHECK-NEXT:    extr x10, x1, x0, #47
-; CHECK-NEXT:    extr x8, x8, x1, #47
-; CHECK-NEXT:    orr x9, x10, x9
-; CHECK-NEXT:    orr x8, x8, x9
+; CHECK-NEXT:    orr x8, x1, x8
+; CHECK-NEXT:    orr x8, x8, x3, lsl #17
 ; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll b/llvm/test/CodeGen/AArch64/logic-shift.ll
--- a/llvm/test/CodeGen/AArch64/logic-shift.ll
+++ b/llvm/test/CodeGen/AArch64/logic-shift.ll
@@ -690,8 +690,8 @@
 define i32 @or_fshl_commute0(i32 %x, i32 %y) {
 ; CHECK-LABEL: or_fshl_commute0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror w8, w0, #27
-; CHECK-NEXT:    orr w0, w8, w1, lsl #5
+; CHECK-NEXT:    orr w8, w0, w1
+; CHECK-NEXT:    extr w0, w8, w0, #27
 ; CHECK-NEXT:    ret
   %or1 = or i32 %x, %y
   %sh1 = shl i32 %or1, 5
@@ -703,8 +703,8 @@
 define i64 @or_fshl_commute1(i64 %x, i64 %y) {
 ; CHECK-LABEL: or_fshl_commute1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror x8, x0, #29
-; CHECK-NEXT:    orr x0, x8, x1, lsl #35
+; CHECK-NEXT:    orr w8, w1, w0
+; CHECK-NEXT:    extr x0, x8, x0, #29
 ; CHECK-NEXT:    ret
   %or1 = or i64 %y, %x
   %sh1 = shl i64 %or1, 35
@@ -762,8 +762,8 @@
 define i64 @or_fshr_commute0(i64 %x, i64 %y) {
 ; CHECK-LABEL: or_fshr_commute0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror x8, x0, #24
-; CHECK-NEXT:    orr x0, x8, x1, lsr #24
+; CHECK-NEXT:    orr x8, x0, x1
+; CHECK-NEXT:    extr x0, x0, x8, #24
 ; CHECK-NEXT:    ret
   %or1 = or i64 %x, %y
   %sh1 = shl i64 %x, 40
@@ -775,8 +775,8 @@
 define i32 @or_fshr_commute1(i32 %x, i32 %y) {
 ; CHECK-LABEL: or_fshr_commute1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror w8, w0, #29
-; CHECK-NEXT:    orr w0, w8, w1, lsr #29
+; CHECK-NEXT:    orr w8, w1, w0
+; CHECK-NEXT:    extr w0, w0, w8, #29
 ; CHECK-NEXT:    ret
   %or1 = or i32 %y, %x
   %sh1 = shl i32 %x, 3
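With the ExtractImm patterns keyed on the generic fshr node, any constant-amount funnel shift that survives to instruction selection becomes a single EXTR. A hypothetical test in the style of the files above and below (not part of the patch):

; fshr concatenates a:b and shifts right by the constant amount, so
; this should select to a single "extr w0, w0, w1, #7".
define i32 @fshr_const(i32 %a, i32 %b) {
  %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 7)
  ret i32 %r
}
declare i32 @llvm.fshr.i32(i32, i32, i32)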
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -528,27 +528,25 @@
 ; CHECK-LABEL: test_ldnp_v4i63:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x0, #16]
-; CHECK-NEXT:    extr x12, x9, x8, #63
+; CHECK-NEXT:    ldp x10, x12, [x0, #16]
+; CHECK-NEXT:    extr x11, x9, x8, #63
 ; CHECK-NEXT:    and x0, x8, #0x7fffffffffffffff
 ; CHECK-NEXT:    extr x9, x10, x9, #62
-; CHECK-NEXT:    extr x10, x11, x10, #61
-; CHECK-NEXT:    and x1, x12, #0x7fffffffffffffff
+; CHECK-NEXT:    extr x3, x12, x10, #61
+; CHECK-NEXT:    and x1, x11, #0x7fffffffffffffff
 ; CHECK-NEXT:    and x2, x9, #0x7fffffffffffffff
-; CHECK-NEXT:    and x3, x10, #0x7fffffffffffffff
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v4i63:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    ldp x9, x8, [x0]
-; CHECK-BE-NEXT:    ldp x10, x11, [x0, #16]
-; CHECK-BE-NEXT:    extr x9, x9, x8, #61
-; CHECK-BE-NEXT:    extr x8, x8, x10, #62
-; CHECK-BE-NEXT:    extr x10, x10, x11, #63
-; CHECK-BE-NEXT:    and x3, x11, #0x7fffffffffffffff
-; CHECK-BE-NEXT:    and x0, x9, #0x7fffffffffffffff
-; CHECK-BE-NEXT:    and x1, x8, #0x7fffffffffffffff
-; CHECK-BE-NEXT:    and x2, x10, #0x7fffffffffffffff
+; CHECK-BE-NEXT:    ldp x8, x9, [x0, #16]
+; CHECK-BE-NEXT:    ldp x11, x10, [x0]
+; CHECK-BE-NEXT:    and x3, x9, #0x7fffffffffffffff
+; CHECK-BE-NEXT:    extr x12, x10, x8, #62
+; CHECK-BE-NEXT:    extr x8, x8, x9, #63
+; CHECK-BE-NEXT:    extr x0, x11, x10, #61
+; CHECK-BE-NEXT:    and x1, x12, #0x7fffffffffffffff
+; CHECK-BE-NEXT:    and x2, x8, #0x7fffffffffffffff
 ; CHECK-BE-NEXT:    ret
   %lv = load <4 x i63>, ptr %A, align 8, !nontemporal !0
   ret <4 x i63> %lv
diff --git a/llvm/test/CodeGen/AArch64/pr55201.ll b/llvm/test/CodeGen/AArch64/pr55201.ll
--- a/llvm/test/CodeGen/AArch64/pr55201.ll
+++ b/llvm/test/CodeGen/AArch64/pr55201.ll
@@ -4,8 +4,8 @@
 define i32 @f(i32 %x) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror w8, w0, #27
-; CHECK-NEXT:    orr w8, w8, #0x20
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    extr w8, w8, w0, #27
 ; CHECK-NEXT:    and w0, w8, #0xffffffe1
 ; CHECK-NEXT:    ret
   %or1 = or i32 %x, 1
diff --git a/llvm/test/CodeGen/AArch64/setcc-fsh.ll b/llvm/test/CodeGen/AArch64/setcc-fsh.ll
--- a/llvm/test/CodeGen/AArch64/setcc-fsh.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-fsh.ll
@@ -224,8 +224,8 @@
 define i1 @fshl_or_sgt_0(i32 %x, i32 %y) {
 ; CHECK-LABEL: fshl_or_sgt_0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror w8, w0, #30
-; CHECK-NEXT:    orr w8, w8, w1, lsl #2
+; CHECK-NEXT:    orr w8, w0, w1
+; CHECK-NEXT:    extr w8, w8, w0, #30
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -238,8 +238,8 @@
 define i1 @fshl_or_ne_2(i32 %x, i32 %y) {
 ; CHECK-LABEL: fshl_or_ne_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ror w8, w0, #30
-; CHECK-NEXT:    orr w8, w8, w1, lsl #2
+; CHECK-NEXT:    orr w8, w0, w1
+; CHECK-NEXT:    extr w8, w8, w0, #30
 ; CHECK-NEXT:    cmp w8, #2
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
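Several of the updated checks above trade a ror for an extr because the or operand is now folded into the funnel shift before selection. A plain rotate should be unaffected: fshr of a value with itself still prints as ror, since ROR (immediate) is the alias of EXTR with both source registers equal. Sketch of the assumed behavior (not a test from the patch):

define i32 @rot5(i32 %x) {
  ; fshr(x, x, 5) is a rotate right by 5: expected "ror w0, w0, #5".
  %r = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 5)
  ret i32 %r
}
declare i32 @llvm.fshr.i32(i32, i32, i32)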
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -167,29 +167,28 @@
 ; ALL-NEXT:    ubfx x12, x9, #3, #5
 ; ALL-NEXT:    add x8, x8, x12
 ; ALL-NEXT:    and x9, x9, #0x7
+; ALL-NEXT:    mvn w13, w9
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
 ; ALL-NEXT:    stp x10, x11, [sp, #16]
-; ALL-NEXT:    eor x11, x9, #0x3f
 ; ALL-NEXT:    str q1, [sp]
-; ALL-NEXT:    ldp x10, x13, [x8, #8]
-; ALL-NEXT:    ldr x12, [x8, #24]
-; ALL-NEXT:    ldr x8, [x8]
+; ALL-NEXT:    ldp x11, x10, [x8, #8]
+; ALL-NEXT:    ldr x12, [x8]
+; ALL-NEXT:    ldr x8, [x8, #24]
+; ALL-NEXT:    lsr x15, x11, x9
+; ALL-NEXT:    lsl x11, x11, #1
 ; ALL-NEXT:    lsl x14, x10, #1
 ; ALL-NEXT:    lsr x10, x10, x9
-; ALL-NEXT:    lsl x15, x12, #1
-; ALL-NEXT:    lsl x14, x14, x11
-; ALL-NEXT:    lsl x11, x15, x11
-; ALL-NEXT:    mvn w15, w9
-; ALL-NEXT:    lsr x8, x8, x9
 ; ALL-NEXT:    lsr x12, x12, x9
-; ALL-NEXT:    lsr x9, x13, x9
-; ALL-NEXT:    orr x8, x8, x14
-; ALL-NEXT:    orr x9, x9, x11
-; ALL-NEXT:    lsl x11, x13, #1
-; ALL-NEXT:    lsl x11, x11, x15
-; ALL-NEXT:    orr x10, x10, x11
-; ALL-NEXT:    stp x9, x12, [x2, #16]
-; ALL-NEXT:    stp x8, x10, [x2]
+; ALL-NEXT:    lsr x9, x8, x9
+; ALL-NEXT:    lsl x8, x8, #1
+; ALL-NEXT:    lsl x11, x11, x13
+; ALL-NEXT:    lsl x8, x8, x13
+; ALL-NEXT:    orr x11, x11, x12
+; ALL-NEXT:    orr x8, x8, x10
+; ALL-NEXT:    lsl x10, x14, x13
+; ALL-NEXT:    orr x10, x15, x10
+; ALL-NEXT:    stp x8, x9, [x2, #16]
+; ALL-NEXT:    stp x11, x10, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -212,27 +211,25 @@
 ; ALL-NEXT:    sub x8, x8, x12
 ; ALL-NEXT:    and x9, x9, #0x7
 ; ALL-NEXT:    mvn w12, w9
-; ALL-NEXT:    eor x14, x9, #0x3f
 ; ALL-NEXT:    stp q0, q0, [sp]
 ; ALL-NEXT:    stp x10, x11, [sp, #48]
 ; ALL-NEXT:    str q1, [sp, #32]
-; ALL-NEXT:    ldp x11, x10, [x8, #8]
-; ALL-NEXT:    ldr x13, [x8]
-; ALL-NEXT:    ldr x8, [x8, #24]
-; ALL-NEXT:    lsr x15, x11, #1
-; ALL-NEXT:    lsl x11, x11, x9
-; ALL-NEXT:    lsr x16, x10, #1
-; ALL-NEXT:    lsr x12, x15, x12
-; ALL-NEXT:    lsr x15, x13, #1
-; ALL-NEXT:    lsr x16, x16, x14
-; ALL-NEXT:    lsr x14, x15, x14
-; ALL-NEXT:    lsl x13, x13, x9
+; ALL-NEXT:    ldp x10, x11, [x8]
+; ALL-NEXT:    ldp x13, x8, [x8, #16]
+; ALL-NEXT:    lsr x14, x10, #1
+; ALL-NEXT:    lsl x10, x10, x9
+; ALL-NEXT:    lsl x15, x11, x9
+; ALL-NEXT:    lsr x11, x11, #1
+; ALL-NEXT:    lsr x14, x14, x12
+; ALL-NEXT:    lsr x11, x11, x12
 ; ALL-NEXT:    lsl x8, x8, x9
-; ALL-NEXT:    lsl x9, x10, x9
-; ALL-NEXT:    orr x11, x11, x14
-; ALL-NEXT:    orr x8, x8, x16
-; ALL-NEXT:    orr x9, x9, x12
-; ALL-NEXT:    stp x13, x11, [x2]
+; ALL-NEXT:    lsl x9, x13, x9
+; ALL-NEXT:    lsr x13, x13, #1
+; ALL-NEXT:    orr x14, x15, x14
+; ALL-NEXT:    lsr x13, x13, x12
+; ALL-NEXT:    orr x9, x9, x11
+; ALL-NEXT:    orr x8, x8, x13
+; ALL-NEXT:    stp x10, x14, [x2]
 ; ALL-NEXT:    stp x9, x8, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
@@ -257,26 +254,24 @@
 ; ALL-NEXT:    add x8, x8, x10
 ; ALL-NEXT:    and x9, x9, #0x7
 ; ALL-NEXT:    stp x12, x12, [sp, #48]
-; ALL-NEXT:    eor x14, x9, #0x3f
 ; ALL-NEXT:    stp x12, x12, [sp, #32]
 ; ALL-NEXT:    mvn w12, w9
-; ALL-NEXT:    ldp x10, x11, [x8, #8]
-; ALL-NEXT:    ldr x13, [x8, #24]
-; ALL-NEXT:    ldr x8, [x8]
-; ALL-NEXT:    lsl x16, x10, #1
+; ALL-NEXT:    ldp x10, x11, [x8, #16]
+; ALL-NEXT:    ldp x8, x13, [x8]
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsr x10, x10, x9
 ; ALL-NEXT:    lsl x15, x11, #1
-; ALL-NEXT:    lsl x16, x16, x14
-; ALL-NEXT:    lsl x12, x15, x12
+; ALL-NEXT:    asr x11, x11, x9
+; ALL-NEXT:    lsl x15, x15, x12
+; ALL-NEXT:    lsl x14, x14, x12
+; ALL-NEXT:    orr x10, x15, x10
 ; ALL-NEXT:    lsl x15, x13, #1
-; ALL-NEXT:    lsl x14, x15, x14
-; ALL-NEXT:    lsr x11, x11, x9
-; ALL-NEXT:    asr x13, x13, x9
+; ALL-NEXT:    lsl x12, x15, x12
 ; ALL-NEXT:    lsr x8, x8, x9
-; ALL-NEXT:    lsr x9, x10, x9
-; ALL-NEXT:    orr x11, x11, x14
-; ALL-NEXT:    orr x8, x8, x16
-; ALL-NEXT:    orr x9, x9, x12
-; ALL-NEXT:    stp x11, x13, [x2, #16]
+; ALL-NEXT:    lsr x9, x13, x9
+; ALL-NEXT:    orr x8, x12, x8
+; ALL-NEXT:    orr x9, x9, x14
+; ALL-NEXT:    stp x10, x11, [x2, #16]
 ; ALL-NEXT:    stp x8, x9, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
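The churn in the wide-shift checks above appears to be scheduling and register-allocation differences only, not a change of strategy: LowerFunnelShift returns SDValue() for non-constant amounts, so variable funnel shifts still go through the generic lsr/lsl/orr expansion rather than EXTR. Sketch of the unaffected case:

define i64 @fshr_var(i64 %a, i64 %b, i64 %n) {
  ; Variable amount: not handled by the new custom lowering, so this
  ; is expanded to shift/or sequences during legalization.
  %r = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %n)
  ret i64 %r
}
declare i64 @llvm.fshr.i64(i64, i64, i64)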
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -947,27 +947,29 @@
 ; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    mov w9, v1.s[1]
-; CHECK-BE-NEXT:    mov w11, v0.s[1]
-; CHECK-BE-NEXT:    mov w13, v1.s[2]
-; CHECK-BE-NEXT:    fmov w14, s1
+; CHECK-BE-NEXT:    mov w11, v1.s[2]
+; CHECK-BE-NEXT:    fmov w12, s1
+; CHECK-BE-NEXT:    mov w13, v0.s[1]
 ; CHECK-BE-NEXT:    mov w15, v0.s[2]
-; CHECK-BE-NEXT:    fmov w16, s0
 ; CHECK-BE-NEXT:    mov w10, v1.s[3]
+; CHECK-BE-NEXT:    mov w14, v0.s[3]
 ; CHECK-BE-NEXT:    lsl x9, x9, #40
-; CHECK-BE-NEXT:    mov w12, v0.s[3]
-; CHECK-BE-NEXT:    lsl x11, x11, #40
-; CHECK-BE-NEXT:    orr x9, x9, x14, lsl #60
-; CHECK-BE-NEXT:    orr x11, x11, x16, lsl #60
-; CHECK-BE-NEXT:    orr x9, x9, x13, lsl #20
-; CHECK-BE-NEXT:    orr x11, x11, x15, lsl #20
-; CHECK-BE-NEXT:    lsr w13, w14, #4
-; CHECK-BE-NEXT:    lsr w14, w16, #4
+; CHECK-BE-NEXT:    orr x9, x9, x12, lsl #60
+; CHECK-BE-NEXT:    lsr x12, x12, #4
+; CHECK-BE-NEXT:    orr x9, x9, x11, lsl #20
+; CHECK-BE-NEXT:    fmov w11, s0
+; CHECK-BE-NEXT:    lsl x13, x13, #40
+; CHECK-BE-NEXT:    lsr x9, x9, #16
+; CHECK-BE-NEXT:    bfi x9, x12, #48, #4
 ; CHECK-BE-NEXT:    strh w10, [x1, #18]
-; CHECK-BE-NEXT:    extr x9, x13, x9, #16
-; CHECK-BE-NEXT:    strh w12, [x1, #8]
-; CHECK-BE-NEXT:    extr x10, x14, x11, #16
+; CHECK-BE-NEXT:    orr x13, x13, x11, lsl #60
+; CHECK-BE-NEXT:    lsr x11, x11, #4
+; CHECK-BE-NEXT:    orr x13, x13, x15, lsl #20
+; CHECK-BE-NEXT:    strh w14, [x1, #8]
+; CHECK-BE-NEXT:    lsr x12, x13, #16
 ; CHECK-BE-NEXT:    stur x9, [x1, #10]
-; CHECK-BE-NEXT:    str x10, [x1], #64
+; CHECK-BE-NEXT:    bfi x12, x11, #48, #4
+; CHECK-BE-NEXT:    str x12, [x1], #64
 ; CHECK-BE-NEXT:    b.ne .LBB10_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -2701,29 +2703,29 @@
 ; CHECK-BE-NEXT:    ushll v2.2d, v1.2s, #0
 ; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
 ; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BE-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-BE-NEXT:    mov x9, v3.d[1]
+; CHECK-BE-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-BE-NEXT:    fmov x10, d3
 ; CHECK-BE-NEXT:    mov x11, v0.d[1]
-; CHECK-BE-NEXT:    fmov x12, d0
-; CHECK-BE-NEXT:    mov x13, v1.d[1]
-; CHECK-BE-NEXT:    mov x14, v2.d[1]
+; CHECK-BE-NEXT:    fmov x13, d0
+; CHECK-BE-NEXT:    mov x12, v1.d[1]
+; CHECK-BE-NEXT:    strb w9, [x1, #32]
 ; CHECK-BE-NEXT:    orr x10, x9, x10, lsl #33
 ; CHECK-BE-NEXT:    fmov x15, d1
-; CHECK-BE-NEXT:    strb w9, [x1, #32]
-; CHECK-BE-NEXT:    fmov x16, d2
+; CHECK-BE-NEXT:    mov x14, v2.d[1]
 ; CHECK-BE-NEXT:    lsl x11, x11, #2
-; CHECK-BE-NEXT:    lsl x13, x13, #4
-; CHECK-BE-NEXT:    orr x12, x11, x12, lsl #35
-; CHECK-BE-NEXT:    lsl x14, x14, #6
-; CHECK-BE-NEXT:    orr x15, x13, x15, lsl #37
+; CHECK-BE-NEXT:    lsl x12, x12, #4
+; CHECK-BE-NEXT:    orr x13, x11, x13, lsl #35
 ; CHECK-BE-NEXT:    extr x10, x11, x10, #8
-; CHECK-BE-NEXT:    orr x11, x14, x16, lsl #39
-; CHECK-BE-NEXT:    extr x12, x13, x12, #8
-; CHECK-BE-NEXT:    extr x9, x14, x15, #8
-; CHECK-BE-NEXT:    extr x11, xzr, x11, #8
-; CHECK-BE-NEXT:    stp x12, x10, [x1, #16]
-; CHECK-BE-NEXT:    stp x11, x9, [x1], #128
+; CHECK-BE-NEXT:    fmov x11, d2
+; CHECK-BE-NEXT:    orr x15, x12, x15, lsl #37
+; CHECK-BE-NEXT:    lsl x14, x14, #6
+; CHECK-BE-NEXT:    extr x9, x12, x13, #8
+; CHECK-BE-NEXT:    orr x11, x14, x11, lsl #39
+; CHECK-BE-NEXT:    extr x12, x14, x15, #8
+; CHECK-BE-NEXT:    lsr x11, x11, #8
+; CHECK-BE-NEXT:    stp x9, x10, [x1, #16]
+; CHECK-BE-NEXT:    stp x11, x12, [x1], #128
 ; CHECK-BE-NEXT:    b.ne .LBB22_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
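Net effect of the patch, end to end: a funnel shift by a constant amount, whether written as fshl or fshr, reaches instruction selection as FSHR and maps to a single extr. A final sketch (illustrative only, mirroring the pr55201.ll and logic-shift.ll updates):

define i32 @funnel(i32 %x, i32 %y) {
  ; fshl(x, y, 3) is rewritten to fshr(x, y, 29) by LowerFunnelShift,
  ; then selected as "extr w0, w0, w1, #29".
  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 3)
  ret i32 %r
}
declare i32 @llvm.fshl.i32(i32, i32, i32)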