Index: llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -187,7 +187,15 @@
   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
     return selectAddrModeIndexed(Root, Width / 8);
   }
+
+  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+                                     const MachineRegisterInfo &MRI) const;
+  ComplexRendererFns
+  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+                                  unsigned SizeInBytes) const;
   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+                                       unsigned SizeInBytes) const;
 
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
 
@@ -1238,8 +1246,8 @@
   if (DstSize != 64)
     return false;
 
-  // Check if we can do any folding from GEPs etc. into the load.
-  auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1));
+  // Check if we can do any folding from GEPs/shifts etc. into the load.
+  auto ImmFn = selectAddrModeXRO(I.getOperand(1), MemBytes);
   if (!ImmFn)
     return false;
 
@@ -3995,6 +4003,98 @@
   }};
 }
 
+/// Return true if it is worth folding MI into an extended register. That is,
+/// if it's safe to pull it into the addressing mode of a load or store as a
+/// shift.
+bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
+    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+  // Always fold if there is one use, or if we're optimizing for size.
+  Register DefReg = MI.getOperand(0).getReg();
+  if (MRI.hasOneUse(DefReg) ||
+      MI.getParent()->getParent()->getFunction().hasMinSize())
+    return true;
+
+  // It's better to avoid folding and recomputing shifts when we don't have a
+  // fastpath.
+  if (!STI.hasLSLFast())
+    return false;
+
+  // We have a fastpath, so folding a shift in and potentially computing it
+  // many times may be beneficial. Check if this is only used in memory ops.
+  // If it is, then we should fold.
+  return all_of(MRI.use_instructions(DefReg),
+                [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// is a constant value specific to this load instruction. That is, we'll never
+/// see anything other than a 3 here (which corresponds to the size of the
+/// element being loaded.)
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+    MachineOperand &Root, unsigned SizeInBytes) const {
+  if (!Root.isReg())
+    return None;
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // Make sure that the memory op is a valid size.
+  int64_t LegalShiftVal = Log2_32(SizeInBytes);
+  if (LegalShiftVal == 0)
+    return None;
+
+  // We want to find something like this:
+  //
+  // val = G_CONSTANT LegalShiftVal
+  // shift = G_SHL off_reg val
+  // ptr = G_GEP base_reg shift
+  // x = G_LOAD ptr
+  //
+  // And fold it into this addressing mode:
+  //
+  // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+  // Check if we can find the G_GEP.
+  MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
+  if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
+    return None;
+
+  // Now try to match the G_SHL.
+  MachineInstr *Shl =
+      getOpcodeDef(TargetOpcode::G_SHL, Gep->getOperand(2).getReg(), MRI);
+  if (!Shl || !isWorthFoldingIntoExtendedReg(*Shl, MRI))
+    return None;
+
+  // Now, try to find the specific G_CONSTANT.
+  auto ValAndVReg =
+      getConstantVRegValWithLookThrough(Shl->getOperand(2).getReg(), MRI);
+  if (!ValAndVReg)
+    return None;
+
+  // The value must fit into 3 bits, and must be positive. Make sure that is
+  // true.
+  int64_t ImmVal = ValAndVReg->Value;
+  if ((ImmVal & 0x7) != ImmVal)
+    return None;
+
+  // We are only allowed to shift by LegalShiftVal. This shift value is built
+  // into the instruction, so we can't just use whatever we want.
+  if (ImmVal != LegalShiftVal)
+    return None;
+
+  // We can use the LHS of the GEP as the base, and the LHS of the shift as an
+  // offset. Signify that we are shifting by setting the shift flag to 1.
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.add(Shl->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); },
+  }};
+}
+
 /// This is used for computing addresses like this:
 ///
 /// ldr x1, [x2, x3]
@@ -4008,11 +4108,6 @@
     MachineOperand &Root) const {
   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
 
-  // If we have a constant offset, then we probably don't want to match a
-  // register offset.
-  if (isBaseWithConstantOffset(Root, MRI))
-    return None;
-
   // We need a GEP.
   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
   if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
@@ -4033,6 +4128,28 @@
   }};
 }
 
+/// This is intended to be equivalent to selectAddrModeXRO in
+/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
+                                              unsigned SizeInBytes) const {
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // If we have a constant offset, then we probably don't want to match a
+  // register offset.
+  if (isBaseWithConstantOffset(Root, MRI))
+    return None;
+
+  // Try to fold shifts into the addressing mode.
+  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
+  if (AddrModeFns)
+    return AddrModeFns;
+
+  // If that doesn't work, see if it's possible to fold in registers from
+  // a GEP.
+  return selectAddrModeRegisterOffset(Root);
+}
+
 /// Select a "register plus unscaled signed 9-bit immediate" address. This
 /// should only match when there is an offset that is not valid for a scaled
 /// immediate addressing mode. The "Size" argument is the size in bytes of the
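As a rough source-level illustration of what the new selectAddrModeShiftedExtendXReg renderer is after (a sketch only; the function below is hypothetical and not part of either file in this patch): indexing an array of 64-bit elements computes base + (index << 3), which reaches the selector as the G_CONSTANT 3 / G_SHL / G_GEP / G_LOAD chain exercised by the MIR tests below, and with this change the load can be selected with the shift folded into the addressing mode, e.g. ldr x0, [x0, x1, lsl #3], rather than a separate shift feeding the address register.

    // Hypothetical illustration, not part of the patch. Loading base[idx] for
    // 64-bit elements scales the index by 8, i.e. shifts it left by 3, which
    // matches the LegalShiftVal computed for an 8-byte load.
    long long load_elem(const long long *base, long long idx) {
      return base[idx]; // may now select to something like: ldr x0, [x0, x1, lsl #3]
    }

The ldrxrox_shl test below is the MIR-level version of exactly this shape.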
Index: llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ llvm/trunk/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -5,6 +5,15 @@
   define void @ldrxrox_breg_oreg(i64* %addr) { ret void }
   define void @ldrdrox_breg_oreg(i64* %addr) { ret void }
   define void @more_than_one_use(i64* %addr) { ret void }
+  define void @ldrxrox_shl(i64* %addr) { ret void }
+  define void @ldrdrox_shl(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_1(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_2(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_lsl_fast(i64* %addr) #1 { ret void }
+  define void @more_than_one_use_shl_lsl_slow(i64* %addr) { ret void }
+  define void @more_than_one_use_shl_minsize(i64* %addr) #0 { ret void }
+  attributes #0 = { optsize minsize }
+  attributes #1 = { "target-features"="+lsl-fast" }
 
 ...
 ---
@@ -88,3 +97,236 @@
     %6:gpr(s64) = G_ADD %5, %4
     $x0 = COPY %6(s64)
     RET_ReallyLR implicit $x0
+
+...
+---
+name:            ldrxrox_shl
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: ldrxrox_shl
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: $x2 = COPY [[LDRXroX]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    $x2 = COPY %5(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            ldrdrox_shl
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $d2
+    ; CHECK-LABEL: name: ldrdrox_shl
+    ; CHECK: liveins: $x0, $x1, $d2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: $d2 = COPY [[LDRDroX]]
+    ; CHECK: RET_ReallyLR implicit $d2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:fpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    $d2 = COPY %5(s64)
+    RET_ReallyLR implicit $d2
+
+...
+---
+name:            more_than_one_use_shl_1
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that we can still fall back to the register-register addressing
+    ; mode when we fail to pull in the shift.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_1
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[UBFMXri]], 0, 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+    ; CHECK: $x2 = COPY [[ADDXrr]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_2
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when the GEP is used outside a memory op, we don't do any
+    ; folding at all.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_2
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]]
+    ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]]
+    ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]]
+    ; CHECK: $x2 = COPY [[ADDXrr2]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    %8:gpr(s64) = G_PTRTOINT %4
+    %9:gpr(s64) = G_ADD %8, %7
+    $x2 = COPY %9(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_lsl_fast
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when we have a fastpath for shift-left, we perform the folding
+    ; if it has more than one use.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_lsl_fast
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
+    ; CHECK: $x2 = COPY [[ADDXrr]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_lsl_slow
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that we don't fold into multiple memory ops when we don't have a
+    ; fastpath for shift-left.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_lsl_slow
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+    ; CHECK: $x2 = COPY [[ADDXrr1]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %7:gpr(s64) = G_ADD %5, %6
+    $x2 = COPY %7(s64)
+    RET_ReallyLR implicit $x2
+
+...
+---
+name:            more_than_one_use_shl_minsize
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; Show that when we're optimizing for size, we'll do the folding no matter
+    ; what.
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: more_than_one_use_shl_minsize
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[COPY1]], [[UBFMXri]]
+    ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load 8 from %ir.addr)
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0
+    ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]]
+    ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrr]], [[ADDXrr1]]
+    ; CHECK: $x2 = COPY [[ADDXrr2]]
+    ; CHECK: RET_ReallyLR implicit $x2
+    %0:gpr(s64) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 3
+    %2:gpr(s64) = G_SHL %0, %1(s64)
+    %3:gpr(p0) = COPY $x1
+    %4:gpr(p0) = G_GEP %3, %2
+    %5:gpr(s64) = G_LOAD %4(p0) :: (load 8 from %ir.addr)
+    %6:gpr(s64) = G_ADD %2, %1
+    %7:gpr(s64) = G_ADD %5, %6
+    %8:gpr(s64) = G_PTRTOINT %4
+    %9:gpr(s64) = G_ADD %8, %7
+    $x2 = COPY %9(s64)
+    RET_ReallyLR implicit $x2
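The more_than_one_use_shl_lsl_fast / more_than_one_use_shl_lsl_slow pair above pins down the new heuristic: with the +lsl-fast target feature the shift is folded into both loads (two LDRXroX), while without it the shift is materialized once (UBFMXri) and the scaled address is reused (two LDRXui). A rough source-level analogue, purely illustrative and not part of the patch, assuming both loads reach selection sharing a single shifted offset:

    // Hypothetical sketch: two loads from the same base + (idx << 3) address.
    // With +lsl-fast it can be worth paying for the shift inside both
    // addressing modes; otherwise it is cheaper to compute the shifted address
    // once and reuse it, which is the choice isWorthFoldingIntoExtendedReg makes.
    long long sum_twice(const volatile long long *base, long long idx) {
      return base[idx] + base[idx]; // volatile keeps both loads
    }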