diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -196,6 +196,13 @@
   (apply [{ applyMutateAnyExtToZExt(*${d}, MRI, B, Observer); }])
 >;
 
+def split_store_zero_128 : GICombineRule<
+  (defs root:$d),
+  (match (wip_match_opcode G_STORE):$d,
+         [{ return matchSplitStoreZero128(*${d}, MRI); }]),
+  (apply [{ applySplitStoreZero128(*${d}, MRI, B, Observer); }])
+>;
+
 // Post-legalization combines which should happen at all optimization levels.
 // (E.g. ones that facilitate matching for the selector) For example, matching
 // pseudos.
@@ -220,6 +227,7 @@
                         icmp_to_true_false_known_bits, merge_unmerge,
                         select_combines, fold_merge_to_zext,
                         constant_fold, identity_combines,
-                        ptr_add_immed_chain, overlapping_and]> {
+                        ptr_add_immed_chain, overlapping_and,
+                        split_store_zero_128]> {
   let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -289,6 +289,44 @@
   Observer.changedInstr(MI);
 }
 
+/// Match a 128b store of zero and split it into two 64 bit stores, for
+/// size/performance reasons.
+static bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
+  GStore &Store = cast<GStore>(MI);
+  if (!Store.isSimple())
+    return false;
+  LLT ValTy = MRI.getType(Store.getValueReg());
+  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
+    return false;
+  if (ValTy.getSizeInBits() != Store.getMemSizeInBits())
+    return false; // Don't split truncating stores.
+  if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
+    return false;
+  auto MaybeCst = isConstantOrConstantSplatVector(
+      *MRI.getVRegDef(Store.getValueReg()), MRI);
+  return MaybeCst && MaybeCst->isZero();
+}
+
+static void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                   MachineIRBuilder &B,
+                                   GISelChangeObserver &Observer) {
+  B.setInstrAndDebugLoc(MI);
+  GStore &Store = cast<GStore>(MI);
+  LLT ValTy = MRI.getType(Store.getValueReg());
+  assert(ValTy.isVector() && "Expected a vector store value");
+  LLT NewTy = LLT::scalar(64);
+  Register PtrReg = Store.getPointerReg();
+  auto Zero = B.buildConstant(NewTy, 0);
+  auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg,
+                               B.buildConstant(LLT::scalar(64), 8));
+  auto &MF = *MI.getMF();
+  auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy);
+  auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy);
+  B.buildStore(Zero, PtrReg, *LowMMO);
+  B.buildStore(Zero, HighPtr, *HighMMO);
+  Store.eraseFromParent();
+}
+
 #define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AArch64GenPostLegalizeGICombiner.inc"
 #undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-split-zero-stores.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-split-zero-stores.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-split-zero-stores.mir
@@ -0,0 +1,200 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+...
+---
+name: v2s64_split
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $x0
+
+    ; Split a store of <2 x i64> into two scalar stores.
+ + ; CHECK-LABEL: name: v2s64_split + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK-NEXT: G_STORE %zero(s64), [[COPY]](p0) :: (store (s64), align 16) + ; CHECK-NEXT: G_STORE %zero(s64), [[PTR_ADD]](p0) :: (store (s64) into unknown-address + 8) + ; CHECK-NEXT: RET_ReallyLR + %0:_(p0) = COPY $x0 + %zero:_(s64) = G_CONSTANT i64 0 + %zerovec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero + G_STORE %zerovec(<2 x s64>), %0(p0) :: (store (<2 x s64>)) + RET_ReallyLR + +... +--- +name: v4i32_split +legalized: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0 + + ; CHECK-LABEL: name: v4i32_split + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CHECK-NEXT: G_STORE [[C]](s64), [[COPY]](p0) :: (store (s64), align 16) + ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into unknown-address + 8) + ; CHECK-NEXT: RET_ReallyLR + %0:_(p0) = COPY $x0 + %zero:_(s32) = G_CONSTANT i32 0 + %zerovec:_(<4 x s32>) = G_BUILD_VECTOR %zero, %zero, %zero, %zero + G_STORE %zerovec(<4 x s32>), %0(p0) :: (store (<4 x s32>)) + RET_ReallyLR + +... 
+--- +name: v8i16_split +legalized: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0 + + ; CHECK-LABEL: name: v8i16_split + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CHECK-NEXT: G_STORE [[C]](s64), [[COPY]](p0) :: (store (s64), align 16) + ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into unknown-address + 8) + ; CHECK-NEXT: RET_ReallyLR + %0:_(p0) = COPY $x0 + %zero:_(s16) = G_CONSTANT i16 0 + %zerovec:_(<8 x s16>) = G_BUILD_VECTOR %zero, %zero, %zero, %zero, %zero, %zero, %zero, %zero + G_STORE %zerovec(<8 x s16>), %0(p0) :: (store (<8 x s16>)) + RET_ReallyLR + +... + +# Negative tests +--- +name: v2i32_nosplit +legalized: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0 + + ; CHECK-LABEL: name: v2i32_nosplit + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %zerovec:_(<2 x s32>) = G_BUILD_VECTOR %zero(s32), %zero(s32) + ; CHECK-NEXT: G_STORE %zerovec(<2 x s32>), [[COPY]](p0) :: (store (<2 x s32>)) + ; CHECK-NEXT: RET_ReallyLR + %0:_(p0) = COPY $x0 + %zero:_(s32) = G_CONSTANT i32 0 + %zerovec:_(<2 x s32>) = G_BUILD_VECTOR %zero, %zero + G_STORE %zerovec(<2 x s32>), %0(p0) :: (store (<2 x s32>)) + RET_ReallyLR + +... 
+--- +name: multiple_uses +legalized: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0 + + ; CHECK-LABEL: name: multiple_uses + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zerovec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: G_STORE %zerovec(<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>)) + ; CHECK-NEXT: $q0 = COPY %zerovec(<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR + %0:_(p0) = COPY $x0 + %zero:_(s64) = G_CONSTANT i64 0 + %zerovec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero + G_STORE %zerovec(<2 x s64>), %0(p0) :: (store (<2 x s64>)) + $q0 = COPY %zerovec + RET_ReallyLR + +... +--- +name: truncating +legalized: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0 + + ; CHECK-LABEL: name: truncating + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %zerovec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64) + ; CHECK-NEXT: G_STORE %zerovec(<2 x s64>), [[COPY]](p0) :: (store (<2 x s32>)) + ; CHECK-NEXT: RET_ReallyLR + %0:_(p0) = COPY $x0 + %zero:_(s64) = G_CONSTANT i64 0 + %zerovec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero + G_STORE %zerovec(<2 x s64>), %0(p0) :: (store (<2 x s32>)) + RET_ReallyLR + +... 
+---
+name: volatile
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: volatile
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %zerovec:_(<2 x s64>) = G_BUILD_VECTOR %zero(s64), %zero(s64)
+    ; CHECK-NEXT: G_STORE %zerovec(<2 x s64>), [[COPY]](p0) :: (volatile store (<4 x s32>))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %zero:_(s64) = G_CONSTANT i64 0
+    %zerovec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero
+    G_STORE %zerovec(<2 x s64>), %0(p0) :: (volatile store (<4 x s32>))
+    RET_ReallyLR
+
+...
+---
+name: s128_scalar
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $x0
+
+    ; Negative test: a 128-bit scalar (non-vector) store is not split.
+
+    ; CHECK-LABEL: name: s128_scalar
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: %zero:_(s128) = G_CONSTANT i128 0
+    ; CHECK-NEXT: G_STORE %zero(s128), [[COPY]](p0) :: (store (s128))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %zero:_(s128) = G_CONSTANT i128 0
+    G_STORE %zero(s128), %0(p0) :: (store (s128))
+    RET_ReallyLR
+
+...