Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1550,10 +1550,12 @@ continue; } } - // If the destination register of the loads is the same register, bail - // and keep looking. A load-pair instruction with both destination - // registers the same is UNPREDICTABLE and will result in an exception. - if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { + // If the destination register of one load is the same register or a + // sub/super register of the other load, bail and keep looking. A + // load-pair instruction with both destination registers the same is + // UNPREDICTABLE and will result in an exception. + if (MayLoad && + TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) { LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); MemInsns.push_back(&MI); Index: llvm/test/CodeGen/AArch64/aarch64-ldst-subsuperReg-no-ldp.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/aarch64-ldst-subsuperReg-no-ldp.mir @@ -0,0 +1,111 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -run-pass=aarch64-ldst-opt %s -o - | FileCheck %s +# +# The test below checks that when the AArch64 Load Store Optimization pass tries to +# convert load instructions into an ldp instruction, and when the destination +# registers are sub/super registers of each other, then the conversion should not occur. +# +# For example, for the following pattern: +# ldr x10, [x9] +# ldr w10, [x9, 8] +# We cannot convert it to an ldp instruction. 
+# +# The pattern we check in this test file is the following pattern in function test: +# +# renamable $x10 = LDRSWui renamable $x9, 3 :: (dereferenceable load 4 from `i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 4, i64 0)`, !tbaa !0) +# renamable $w10 = LDRWui renamable $x9, 2 :: (dereferenceable load 4 from `i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 3)`, !tbaa !0) +# +# These two LDR instructions cannot be converted into an LDP instruction. +# +# CHECK-NOT: LDP +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown-linux-gnu" + + @i = common dso_local local_unnamed_addr global i32 0, align 4 + @g = common dso_local local_unnamed_addr global i32 0, align 4 + @_MergedGlobals = private global <{ i16, [2 x i8], i32, i32, [2 x i32] }> <{ i16 3, [2 x i8] zeroinitializer, i32 3, i32 1, [2 x i32] [i32 1, i32 1] }>, align 4 + + + ; Function Attrs: nofree norecurse nounwind + define dso_local i32 @test() local_unnamed_addr #0 { + entry: + br i1 undef, label %s.exit, label %for.cond1.preheader.preheader.i + + for.cond1.preheader.preheader.i: ; preds = %entry + store i32 7, i32* @i, align 4, !tbaa !0 + br label %s.exit + + s.exit: ; preds = %for.cond1.preheader.preheader.i, %entry + %0 = load i32, i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 3), align 4, !tbaa !0 + %dec = add nsw i32 %0, -1 + store i32 %dec, i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 3), align 4, !tbaa !0 + store i32 ptrtoint (i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* 
@_MergedGlobals, i32 0, i32 2) to i32), i32* @i, align 4, !tbaa !0 + %conv1.i = sext i8 undef to i16 + store i16 %conv1.i, i16* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 0), align 4, !tbaa !4 + %1 = load i32, i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 2), align 4, !tbaa !0 + %cmp.i = icmp sle i32 %1, undef + %conv6.i = zext i1 %cmp.i to i64 + %2 = load i32, i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 4, i64 0), align 4, !tbaa !0 + %conv7.i = sext i32 %2 to i64 + %cmp.i.i = icmp eq i32 %2, 0 + br i1 %cmp.i.i, label %r.exit, label %cond.false.i.i + + cond.false.i.i: ; preds = %s.exit + %div.i.i = udiv i64 %conv6.i, %conv7.i + br label %r.exit + + r.exit: ; preds = %cond.false.i.i, %s.exit + store i32 ptrtoint ([2 x i32]* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 4) to i32), i32* @i, align 4, !tbaa !0 + %3 = load i32, i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 3), align 4, !tbaa !0 + store i32 %3, i32* @g, align 4, !tbaa !0 + ret i32 undef + } + + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + + !0 = !{!1, !1, i64 0} + !1 = !{!"int", !2, i64 0} + !2 = !{!"omnipotent 
char", !3, i64 0} + !3 = !{!"Simple C/C++ TBAA"} + !4 = !{!5, !5, i64 0} + !5 = !{!"short", !2, i64 0} + +... +--- +name: test +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2(0x40000000), %bb.1(0x40000000) + + renamable $x8 = ADRP target-flags(aarch64-page) @i + CBNZW $wzr, %bb.2 + + bb.1: + successors: %bb.2(0x80000000) + liveins: $x8 + + renamable $w9 = MOVZWi 7, 0 + STRWui killed renamable $w9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @i :: (store 4 into @i, !tbaa !0) + + bb.2: + liveins: $x8 + + $x9 = ADRP target-flags(aarch64-page) @_MergedGlobals + renamable $x9 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-nc) @_MergedGlobals, 0 + renamable $w10 = LDRWui renamable $x9, 2 :: (dereferenceable load 4 from `i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 3)`, !tbaa !0) + $x11 = ORRXrs $xzr, $x9, 0 + renamable $w10 = nsw SUBWri killed renamable $w10, 1, 0 + STRWui killed renamable $w10, renamable $x9, 2 :: (store 4 into `i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 3)`, !tbaa !0) + early-clobber renamable $x11, dead $wzr = LDRWpre killed renamable $x11, 4 + renamable $x10 = LDRSWui renamable $x9, 3 :: (dereferenceable load 4 from `i32* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 4, i64 0)`, !tbaa !0) + STRWui renamable $w11, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @i, implicit killed $x11 :: (store 4 into @i, !tbaa !0) + STRHHui $wzr, renamable $x9, 0 :: (store 2 into `i16* getelementptr inbounds (<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 0)`, align 4, !tbaa !4) + renamable $w10 = LDRWui renamable $x9, 2 :: (dereferenceable load 4 from `i32* getelementptr inbounds 
(<{ i16, [2 x i8], i32, i32, [2 x i32] }>, <{ i16, [2 x i8], i32, i32, [2 x i32] }>* @_MergedGlobals, i32 0, i32 3)`, !tbaa !0) + renamable $w9 = ADDWri renamable $w9, 12, 0, implicit killed $x9, implicit-def $x9 + STRWui renamable $w9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @i, implicit killed $x9 :: (store 4 into @i, !tbaa !0) + renamable $x8 = ADRP target-flags(aarch64-page) @g + STRWui killed renamable $w10, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @g :: (store 4 into @g, !tbaa !0) + RET undef $lr, implicit undef $w0 + +...