diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2875,10 +2875,17 @@
   return false;
 }
 
+// Given an 'ISD::OR' node that would otherwise be selected as BFM, analyze
+// its operands and select the node as AArch64::ORR with a shifted register
+// operand instead if that is more efficient. Returns true iff ORR is selected.
 static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
                             SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
                             const bool BiggerPattern) {
   EVT VT = N->getValueType(0);
+  assert(N->getOpcode() == ISD::OR && "Expect N to be an OR node");
+  assert(((N->getOperand(0) == OrOpd0 && N->getOperand(1) == OrOpd1) ||
+          (N->getOperand(1) == OrOpd0 && N->getOperand(0) == OrOpd1)) &&
+         "Expect OrOpd0 and OrOpd1 to be operands of ISD::OR");
   assert((VT == MVT::i32 || VT == MVT::i64) &&
          "Expect result type to be i32 or i64 since N is combinable to BFM");
   SDLoc DL(N);
@@ -2887,6 +2894,7 @@
   if (OrOpd1 != Dst)
     return false;
 
+  const unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
   // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
   // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
   if (BiggerPattern) {
@@ -2903,7 +2911,6 @@
     uint64_t EncodedShiftImm;
     if (isWorthFoldingIntoOrrWithShift(Dst, CurDAG, ShiftedOperand,
                                        EncodedShiftImm)) {
-      unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
       SDValue Ops[] = {OrOpd0, ShiftedOperand,
                        CurDAG->getTargetConstant(EncodedShiftImm, DL, VT)};
       CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
@@ -2915,16 +2922,58 @@
 
   assert((!BiggerPattern) && "BiggerPattern should be handled above");
 
   uint64_t ShlImm;
-  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
-      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
-    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
-    SDValue Ops[] = {
-        Dst, Src,
-        CurDAG->getTargetConstant(
-            AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
-    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
-    return true;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm)) {
+    if (OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
+      SDValue Ops[] = {
+          Dst, Src,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+
+    // Select the following pattern as an ORR with a left-shifted operand
+    // rather than BFI:
+    //   %val1 = op ..
+    //   %val2 = shl %val1, #imm
+    //   %res = or %val1, %val2
+    //
+    // If N were selected as BFI, OrOpd0 would be the operand from which bits
+    // are extracted (i.e., folded into BFI) and OrOpd1 would be the
+    // destination operand (i.e., preserved). Instead, fold OrOpd0 into a
+    // left-shifted register operand of ORR directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {
+          OrOpd1, OrOpd1,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+  }
+
+  uint64_t SrlImm;
+  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SRL, SrlImm)) {
+    // Select the following pattern as an ORR with a right-shifted operand
+    // rather than BFXIL:
+    //   %val1 = op ..
+    //   %val2 = lshr %val1, #imm
+    //   %res = or %val1, %val2
+    //
+    // If N were selected as BFXIL, OrOpd0 would be the operand from which bits
+    // are extracted (i.e., folded into BFXIL) and OrOpd1 would be the
+    // destination operand (i.e., preserved). Instead, fold OrOpd0 into a
+    // right-shifted register operand of ORR directly.
+    if (OrOpd0.getOperand(0) == OrOpd1) {
+      SDValue Ops[] = {
+          OrOpd1, OrOpd1,
+          CurDAG->getTargetConstant(
+              AArch64_AM::getShifterImm(AArch64_AM::LSR, SrlImm), DL, VT)};
+      CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
+      return true;
+    }
+  }
 
   return false;
 }
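For illustration only (not part of the patch): a minimal C++ source-level sketch of the SHL case above, mirroring @test_orr_not_bfi_i64 from the test diff below. The function name is invented, and the annotated assembly assumes the patched selector.

  #include <cstdint>

  // Hypothetical example: a value OR'ed with a left-shifted copy of itself.
  // With this patch the 'or' selects to a single ORR with an LSL'ed register
  // operand instead of BFI plus a register move.
  uint64_t duplicate_low_byte(uint64_t x) {
    uint64_t v = x & 0xff; // %val1 = and i64 %0, 255
    return v | (v << 8);   // %res = or (shl %val1, 8), %val1
                           //   => orr x0, x8, x8, lsl #8
  }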
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -638,13 +638,12 @@
 }
 
 ; For or operation, one operand is a left shift of another operand.
-; Use orr with left-shifted operand is better than bfi.
+; So orr with a left-shifted operand is generated (not bfi).
 define i64 @test_orr_not_bfi_i64(i64 %0) {
 ; CHECK-LABEL: test_orr_not_bfi_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff
-; CHECK-NEXT:    bfi x8, x0, #8, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 255
   %3 = shl i64 %2, 8
@@ -668,14 +667,13 @@
   ret i32 %or_res
 }
 
-; orr is better than bfi, since both simplify away one instruction (%3)
+; orr is generated (not bfi): either choice simplifies away one instruction (%3),
 ; while orr has shorter latency and higher throughput.
 define i32 @test_orr_not_bfi_i32(i32 %0) {
 ; CHECK-LABEL: test_orr_not_bfi_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    bfi w8, w0, #8, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsl #8
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 255
   %3 = shl i32 %2, 8
@@ -698,14 +696,13 @@
   ret i64 %or_res
 }
 
-; orr is better than bfxil, since one operand is the right shift of another
+; orr is generated (not bfxil), since one operand is the right shift of another
 ; operand.
 define i64 @orr_not_bfxil_test2_i64(i64 %0) {
 ; CHECK-LABEL: orr_not_bfxil_test2_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and x8, x0, #0xff000
-; CHECK-NEXT:    bfxil x8, x0, #12, #8
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    orr x0, x8, x8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i64 %0, 1044480 ; 0xff000
   %3 = lshr i64 %2, 12
@@ -729,13 +726,12 @@
   ret i32 %or_res
 }
 
-; one operand is the shift of another operand, so orr is better.
+; one operand is the shift of another operand, so orr is generated (not bfxil).
 define i32 @orr_not_bfxil_test2_i32(i32 %0) {
 ; CHECK-LABEL: orr_not_bfxil_test2_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff000
-; CHECK-NEXT:    bfxil w8, w0, #12, #8
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    orr w0, w8, w8, lsr #12
 ; CHECK-NEXT:    ret
   %2 = and i32 %0, 1044480 ; 0xff000
   %3 = lshr i32 %2, 12
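Similarly, for illustration only: a hypothetical source-level counterpart of the SRL case (function name invented; the expected assembly mirrors @orr_not_bfxil_test2_i64 above and assumes the patched selector).

  #include <cstdint>

  // Hypothetical example: a value OR'ed with a right-shifted copy of itself.
  // With this patch the 'or' selects to ORR with an LSR'ed register operand
  // instead of BFXIL plus a register move.
  uint64_t fold_right_shift(uint64_t x) {
    uint64_t v = x & 0xff000; // %val1 = and i64 %0, 1044480
    return v | (v >> 12);     // %res = or (lshr %val1, 12), %val1
                              //   => orr x0, x8, x8, lsr #12
  }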