diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -201,8 +201,22 @@
   }
 }
 
+// These instructions set the memory tag and either preserve memory contents
+// or set them to zero, ignoring the address part of the source register.
+static bool isTagStore(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::STGOffset:
+  case AArch64::STZGOffset:
+  case AArch64::ST2GOffset:
+  case AArch64::STZ2GOffset:
+    return true;
+  }
+}
+
 // Scaling factor for unscaled load or store.
-static int getMemScale(MachineInstr &MI) {
+static int getMemScale(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     llvm_unreachable("Opcode has unknown scale!");
@@ -255,6 +269,11 @@
   case AArch64::STURQi:
   case AArch64::LDPQi:
   case AArch64::STPQi:
+  case AArch64::STGOffset:
+  case AArch64::STZGOffset:
+  case AArch64::ST2GOffset:
+  case AArch64::STZ2GOffset:
+  case AArch64::STGPi:
     return 16;
   }
 }
@@ -449,6 +468,16 @@
     return AArch64::STPWpre;
   case AArch64::STPXi:
     return AArch64::STPXpre;
+  case AArch64::STGOffset:
+    return AArch64::STGPreIndex;
+  case AArch64::STZGOffset:
+    return AArch64::STZGPreIndex;
+  case AArch64::ST2GOffset:
+    return AArch64::ST2GPreIndex;
+  case AArch64::STZ2GOffset:
+    return AArch64::STZ2GPreIndex;
+  case AArch64::STGPi:
+    return AArch64::STGPpre;
   }
 }
 
@@ -518,6 +547,16 @@
     return AArch64::STPWpost;
   case AArch64::STPXi:
     return AArch64::STPXpost;
+  case AArch64::STGOffset:
+    return AArch64::STGPostIndex;
+  case AArch64::STZGOffset:
+    return AArch64::STZGPostIndex;
+  case AArch64::ST2GOffset:
+    return AArch64::ST2GPostIndex;
+  case AArch64::STZ2GOffset:
+    return AArch64::STZ2GPostIndex;
+  case AArch64::STGPi:
+    return AArch64::STGPpost;
   }
 }
 
@@ -536,10 +575,30 @@
   case AArch64::STPQi:
   case AArch64::STPWi:
   case AArch64::STPXi:
+  case AArch64::STGPi:
     return true;
   }
 }
 
+// Returns the scale and offset range of pre/post-indexed variants of MI.
+static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
+                                       int &MinOffset, int &MaxOffset) {
+  bool IsPaired = isPairedLdSt(MI);
+  bool IsTagStore = isTagStore(MI);
+  // ST*G and all paired ldst have the same scale in pre/post-indexed variants
+  // as in the "unsigned offset" variant.
+  // All other pre/post-indexed ldst instructions are unscaled.
+  Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;
+
+  if (IsPaired) {
+    MinOffset = -64;
+    MaxOffset = 63;
+  } else {
+    MinOffset = -256;
+    MaxOffset = 255;
+  }
+}
+
 static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
                                           unsigned PairedRegOp = 0) {
   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
@@ -618,6 +677,11 @@
   case AArch64::LDRWui:
   case AArch64::LDRHHui:
   case AArch64::LDRBBui:
+  case AArch64::STGOffset:
+  case AArch64::STZGOffset:
+  case AArch64::ST2GOffset:
+  case AArch64::STZ2GOffset:
+  case AArch64::STGPi:
   // Unscaled instructions.
   case AArch64::STURSi:
   case AArch64::STURDi:
@@ -1328,18 +1392,19 @@
   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                              : getPostIndexedOpcode(I->getOpcode());
   MachineInstrBuilder MIB;
+  int Scale, MinOffset, MaxOffset;
+  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
   if (!isPairedLdSt(*I)) {
     // Non-paired instruction.
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
               .add(getLdStRegOp(*Update))
               .add(getLdStRegOp(*I))
               .add(getLdStBaseOp(*I))
-              .addImm(Value)
+              .addImm(Value / Scale)
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(*Update));
   } else {
     // Paired instruction.
-    int Scale = getMemScale(*I);
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
               .add(getLdStRegOp(*Update))
               .add(getLdStRegOp(*I, 0))
@@ -1395,28 +1460,21 @@
         MI.getOperand(1).getReg() != BaseReg)
       break;
 
-    bool IsPairedInsn = isPairedLdSt(MemMI);
     int UpdateOffset = MI.getOperand(2).getImm();
     if (MI.getOpcode() == AArch64::SUBXri)
       UpdateOffset = -UpdateOffset;
 
-    // For non-paired load/store instructions, the immediate must fit in a
-    // signed 9-bit integer.
-    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
+    // The immediate must be a multiple of the scaling factor of the
+    // pre/post-indexed instruction.
+    int Scale, MinOffset, MaxOffset;
+    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
+    if (UpdateOffset % Scale != 0)
       break;
 
-    // For paired load/store instructions, the immediate must be a multiple of
-    // the scaling factor. The scaled offset must also fit into a signed 7-bit
-    // integer.
-    if (IsPairedInsn) {
-      int Scale = getMemScale(MemMI);
-      if (UpdateOffset % Scale != 0)
-        break;
-
-      int ScaledOffset = UpdateOffset / Scale;
-      if (ScaledOffset > 63 || ScaledOffset < -64)
-        break;
-    }
+    // The scaled offset must fit in the instruction immediate.
+    int ScaledOffset = UpdateOffset / Scale;
+    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
+      break;
 
     // If we have a non-zero Offset, we check that it matches the amount
     // we're adding to the register.
@@ -1442,13 +1500,19 @@
   if (MIUnscaledOffset != UnscaledOffset)
     return E;
 
-  // If the base register overlaps a destination register, we can't
-  // merge the update.
-  bool IsPairedInsn = isPairedLdSt(MemMI);
-  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
-    Register DestReg = getLdStRegOp(MemMI, i).getReg();
-    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
-      return E;
+  // If the base register overlaps a source/destination register, we can't
+  // merge the update. This does not apply to tag store instructions, which
+  // ignore the address part of the source register.
+  // Nor does it apply to STGPi which, unlike normal stores, has no
+  // unpredictable behavior in this case and always performs the writeback
+  // after reading the source register value.
+  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
+    bool IsPairedInsn = isPairedLdSt(MemMI);
+    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+      Register DestReg = getLdStRegOp(MemMI, i).getReg();
+      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+        return E;
+    }
   }
 
   // Track which register units have been modified and used between the first
@@ -1496,11 +1560,13 @@
     return E;
   // If the base register overlaps a destination register, we can't
   // merge the update.
-  bool IsPairedInsn = isPairedLdSt(MemMI);
-  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
-    Register DestReg = getLdStRegOp(MemMI, i).getReg();
-    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
-      return E;
+  if (!isTagStore(MemMI)) {
+    bool IsPairedInsn = isPairedLdSt(MemMI);
+    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+      Register DestReg = getLdStRegOp(MemMI, i).getReg();
+      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+        return E;
+    }
   }
 
   // Track which register units have been modified and used between the first
@@ -1659,7 +1725,7 @@
   // however, is not, so adjust here.
   int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
 
-  // Look forward to try to find a post-index instruction. For example,
+  // Look forward to try to find a pre-index instruction. For example,
   // ldr x1, [x0, #64]
   // add x0, x0, #64
   // merged into:
diff --git a/llvm/test/CodeGen/AArch64/ldst-opt-mte.mir b/llvm/test/CodeGen/AArch64/ldst-opt-mte.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldst-opt-mte.mir
@@ -0,0 +1,285 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s
+---
+
+### STG and its offset limits
+
+# CHECK-LABEL: name: test_STG_post
+# CHECK: STGPostIndex $x0, $x0, 7
+name: test_STG_post
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGOffset $x0, $x0, 0
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STG_post_same_reg
+# CHECK: STGPostIndex $x1, $x0, 7
+name: test_STG_post_same_reg
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    STGOffset $x1, $x0, 0
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STG_post_unaligned
+# CHECK: STGOffset $x0, $x0, 0
+# CHECK-NEXT: ADDXri $x0, 8, 0
+name: test_STG_post_unaligned
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGOffset $x0, $x0, 0
+    $x0 = ADDXri $x0, 8, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STG_post2
+# CHECK: STGPostIndex $x0, $x0, -256
+name: test_STG_post2
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGOffset $x0, $x0, 0
+    $x0 = SUBXri $x0, 4096, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STG_post3
+# CHECK: STGOffset $x0, $x0, 0
+# CHECK-NEXT: SUBXri $x0, 4112, 0
+name: test_STG_post3
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGOffset $x0, $x0, 0
+    $x0 = SUBXri $x0, 4112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STG_post4
+# CHECK: STGPostIndex $x0, $x0, 255
+name: test_STG_post4
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGOffset $x0, $x0, 0
+    $x0 = ADDXri $x0, 4080, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STG_post5
+# CHECK: STGOffset $x0, $x0, 0
+# CHECK-NEXT: ADDXri $x0, 4096, 0
+name: test_STG_post5
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGOffset $x0, $x0, 0
+    $x0 = ADDXri $x0, 4096, 0
+    RET_ReallyLR implicit $x0
+...
+
+### The rest of ST*G variants.
+
+# CHECK-LABEL: name: test_STZG_post
+# CHECK: STZGPostIndex $x0, $x0, 7
+name: test_STZG_post
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STZGOffset $x0, $x0, 0
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_ST2G_post
+# CHECK: ST2GPostIndex $x0, $x0, 7
+name: test_ST2G_post
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    ST2GOffset $x0, $x0, 0
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STZ2G_post
+# CHECK: STZ2GPostIndex $x0, $x0, 7
+name: test_STZ2G_post
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STZ2GOffset $x0, $x0, 0
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
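+
+# Note: the ST*G writeback immediate is a signed 9-bit field scaled by 16, so
+# the tests above exercise byte offsets that are multiples of 16 in the range
+# [-4096, 4080].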
+
+### STGP and its offset limits
+
+# CHECK-LABEL: name: test_STGP_post
+# CHECK: STGPpost $x1, $x2, $x0, 7
+name: test_STGP_post
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2
+
+    STGPi $x1, $x2, $x0, 0
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STGP_post2
+# CHECK: STGPpost $x1, $x2, $x0, -64
+name: test_STGP_post2
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2
+
+    STGPi $x1, $x2, $x0, 0
+    $x0 = SUBXri $x0, 1024, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STGP_post3
+# CHECK: STGPi $x1, $x2, $x0, 0
+# CHECK-NEXT: SUBXri $x0, 1040, 0
+name: test_STGP_post3
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2
+
+    STGPi $x1, $x2, $x0, 0
+    $x0 = SUBXri $x0, 1040, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STGP_post4
+# CHECK: STGPpost $x1, $x2, $x0, 63
+name: test_STGP_post4
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2
+
+    STGPi $x1, $x2, $x0, 0
+    $x0 = ADDXri $x0, 1008, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STGP_post5
+# CHECK: STGPi $x1, $x2, $x0, 0
+# CHECK-NEXT: ADDXri $x0, 1024, 0
+name: test_STGP_post5
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2
+
+    STGPi $x1, $x2, $x0, 0
+    $x0 = ADDXri $x0, 1024, 0
+    RET_ReallyLR implicit $x0
+...
+
+### Pre-indexed forms
+
+# CHECK-LABEL: name: test_STG_pre
+# CHECK: STGPreIndex $x0, $x0, 10
+name: test_STG_pre
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGOffset $x0, $x0, 10
+    $x0 = ADDXri $x0, 160, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STGP_pre
+# CHECK: STGPpre $x1, $x2, $x0, 10
+name: test_STGP_pre
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2
+
+    STGPi $x1, $x2, $x0, 10
+    $x0 = ADDXri $x0, 160, 0
+    RET_ReallyLR implicit $x0
+...
+
+### Pre-indexed forms with add/sub coming before the store.
+
+# CHECK-LABEL: name: test_STG_pre_back
+# CHECK: STGPreIndex $x0, $x0, 2
+name: test_STG_pre_back
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    $x0 = ADDXri $x0, 32, 0
+    STGOffset $x0, $x0, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STGP_pre_back
+# CHECK: STGPpre $x1, $x2, $x0, -3
+name: test_STGP_pre_back
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2
+
+    $x0 = SUBXri $x0, 48, 0
+    STGPi $x1, $x2, $x0, 0
+    RET_ReallyLR implicit $x0
+...
+
+### STGP with source register == address register
+
+# CHECK-LABEL: name: test_STGP_post_same_reg
+# CHECK: STGPpost $x0, $x0, $x0, 7
+name: test_STGP_post_same_reg
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGPi $x0, $x0, $x0, 0
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# CHECK-LABEL: name: test_STGP_pre_same_reg
+# CHECK: STGPpre $x0, $x0, $x0, 7
+name: test_STGP_pre_same_reg
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    STGPi $x0, $x0, $x0, 7
+    $x0 = ADDXri $x0, 112, 0
+    RET_ReallyLR implicit $x0
+...
+
+# This case cannot be merged because the source register is always read before writeback.
+# CHECK-LABEL: name: test_STGP_pre_back_same_reg
+# CHECK: SUBXri $x0, 48, 0
+# CHECK-NEXT: STGPi $x0, $x0, $x0, 0
+name: test_STGP_pre_back_same_reg
+body: |
+  bb.0.entry:
+    liveins: $x0
+
+    $x0 = SUBXri $x0, 48, 0
+    STGPi $x0, $x0, $x0, 0
+    RET_ReallyLR implicit $x0
+...
diff --git a/llvm/test/CodeGen/AArch64/stgp.ll b/llvm/test/CodeGen/AArch64/stgp.ll
--- a/llvm/test/CodeGen/AArch64/stgp.ll
+++ b/llvm/test/CodeGen/AArch64/stgp.ll
@@ -65,7 +65,7 @@
 define void @stgp_alloca(i64 %a, i64 %b) {
 entry:
 ; CHECK-LABEL: stgp_alloca:
-; CHECK: stgp x0, x1, [sp]
+; CHECK: stgp x0, x1, [sp, #-32]!
 ; CHECK: stgp x1, x0, [sp, #16]
 ; CHECK: ret
   %x = alloca i8, i32 32, align 16
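
Reviewer note, not part of the patch: the offset limits the tests probe follow directly from getPrePostIndexedMemOpInfo. Below is a minimal standalone C++ sketch of the writeback-offset legality check; the helper name isLegalWritebackOffset is hypothetical, and the constants are taken from the patch (scale 16 for all tag stores, scaled-immediate range [-256, 255] for ST*G and [-64, 63] for STGP).

// Standalone sketch of the legality check performed when folding an
// add/sub of the base register into a pre/post-indexed tag store.
#include <cassert>
#include <cstdio>

static bool isLegalWritebackOffset(int UpdateOffset, int Scale, int MinOffset,
                                   int MaxOffset) {
  // The byte offset must be a multiple of the access scale...
  if (UpdateOffset % Scale != 0)
    return false;
  // ...and the scaled value must fit in the writeback immediate field.
  int Scaled = UpdateOffset / Scale;
  return Scaled >= MinOffset && Scaled <= MaxOffset;
}

int main() {
  // ST*G: scale 16, signed 9-bit immediate => byte offsets in [-4096, 4080].
  assert(isLegalWritebackOffset(112, 16, -256, 255));   // test_STG_post: imm 7
  assert(!isLegalWritebackOffset(8, 16, -256, 255));    // test_STG_post_unaligned
  assert(isLegalWritebackOffset(-4096, 16, -256, 255)); // test_STG_post2: imm -256
  assert(isLegalWritebackOffset(4080, 16, -256, 255));  // test_STG_post4: imm 255
  assert(!isLegalWritebackOffset(4096, 16, -256, 255)); // test_STG_post5: no merge
  // STGP: scale 16, signed 7-bit immediate => byte offsets in [-1024, 1008].
  assert(isLegalWritebackOffset(1008, 16, -64, 63));    // test_STGP_post4: imm 63
  assert(!isLegalWritebackOffset(1024, 16, -64, 63));   // test_STGP_post5: no merge
  puts("writeback offset checks match the tests above");
  return 0;
}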