diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -199,6 +199,10 @@ // Function alignments. setMinFunctionAlignment(Align(4)); + // Set preferred alignments. + setPrefFunctionAlignment(Subtarget.getPrefFunctionAlignment()); + setPrefLoopAlignment(Subtarget.getPrefLoopAlignment()); + setMaxBytesForAlignment(Subtarget.getMaxBytesForAlignment()); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -52,6 +52,10 @@ LoongArchTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; + Align PrefFunctionAlignment; + Align PrefLoopAlignment; + unsigned MaxBytesForAlignment; + /// Initializes using the passed in CPU and feature strings so that we can /// use initializer lists for subtarget initialization. LoongArchSubtarget &initializeSubtargetDependencies(const Triple &TT, @@ -60,6 +64,9 @@ StringRef FS, StringRef ABIName); + /// Initialize properties based on the selected processor family. + void initializeProperties(StringRef TuneCPU); + public: // Initializes the data members to match that of the specified triple. 
LoongArchSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, @@ -97,6 +104,9 @@ unsigned getGRLen() const { return GRLen; } LoongArchABI::ABI getTargetABI() const { return TargetABI; } bool isXRaySupported() const override { return is64Bit(); } + Align getPrefFunctionAlignment() const { return PrefFunctionAlignment; } + Align getPrefLoopAlignment() const { return PrefLoopAlignment; } + unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp @@ -35,6 +35,7 @@ TuneCPU = CPU; ParseSubtargetFeatures(CPU, TuneCPU, FS); + initializeProperties(TuneCPU); if (Is64Bit) { GRLenVT = MVT::i64; GRLen = 64; @@ -54,6 +55,32 @@ return *this; } +void LoongArchSubtarget::initializeProperties(StringRef TuneCPU) { + // Initialize CPU specific properties. We should add a tablegen feature for + // this in the future so we can specify it together with the subtarget + // features. + + // TODO: Check TuneCPU and override defaults (that are for LA464) once we + // support optimizing for more uarchs. + + // Default to the alignment settings empirically confirmed to perform best + // on LA464, with 4-wide instruction fetch and decode stages. These settings + // can be overridden per TuneCPU here once more uarchs are supported. + // + // We default to such higher-than-minimum alignments because we assume that: + // + // * these settings should benefit most existing uarchs/users, + // * future general-purpose LoongArch cores are likely to have issue widths + // equal to or wider than 4, + // * instruction sequences best for LA464 should not pessimize other future + // uarchs, and + // * narrower cores would not suffer much (aside from slightly increased + // ICache footprint maybe), compared to the gains everywhere else. 
+ PrefFunctionAlignment = Align(32); + PrefLoopAlignment = Align(16); + MaxBytesForAlignment = 16; +} + LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, StringRef ABIName, diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll --- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll @@ -13,6 +13,7 @@ ; LA64-NEXT: andi $a0, $a0, 24 ; LA64-NEXT: nor $a4, $a4, $zero ; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB0_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB0_3 Depth 2 @@ -66,6 +67,7 @@ ; LA64-NEXT: andi $a0, $a0, 24 ; LA64-NEXT: nor $a4, $a4, $zero ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB1_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB1_3 Depth 2 @@ -111,6 +113,7 @@ ; LA64: # %bb.0: ; LA64-NEXT: ld.w $a3, $a0, 0 ; LA64-NEXT: bstrpick.d $a2, $a1, 31, 0 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB2_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB2_3 Depth 2 @@ -150,6 +153,7 @@ ; LA64-LABEL: atomicrmw_uinc_wrap_i64: ; LA64: # %bb.0: ; LA64-NEXT: ld.d $a2, $a0, 0 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB3_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB3_3 Depth 2 @@ -195,6 +199,7 @@ ; LA64-NEXT: andi $a0, $a0, 24 ; LA64-NEXT: nor $a4, $a4, $zero ; LA64-NEXT: andi $a5, $a1, 255 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB4_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB4_3 Depth 2 @@ -253,6 +258,7 @@ ; LA64-NEXT: andi $a0, $a0, 24 ; LA64-NEXT: nor $a4, $a4, $zero ; LA64-NEXT: bstrpick.d $a5, $a1, 15, 0 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB5_1: # 
%atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB5_3 Depth 2 @@ -303,6 +309,7 @@ ; LA64: # %bb.0: ; LA64-NEXT: ld.w $a4, $a0, 0 ; LA64-NEXT: bstrpick.d $a3, $a1, 31, 0 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB6_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB6_3 Depth 2 @@ -347,6 +354,7 @@ ; LA64-LABEL: atomicrmw_udec_wrap_i64: ; LA64: # %bb.0: ; LA64-NEXT: ld.d $a2, $a0, 0 +; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB7_1: # %atomicrmw.start ; LA64-NEXT: # =>This Loop Header: Depth=1 ; LA64-NEXT: # Child Loop BB7_3 Depth 2 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll @@ -11,6 +11,7 @@ ; LA64F-NEXT: addi.w $a1, $zero, 1 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: ffint.s.w $fa1, $fa1 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB0_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 ; LA64F-NEXT: # Child Loop BB0_3 Depth 2 @@ -46,6 +47,7 @@ ; LA64D-NEXT: addi.w $a1, $zero, 1 ; LA64D-NEXT: movgr2fr.w $fa1, $a1 ; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB0_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 ; LA64D-NEXT: # Child Loop BB0_3 Depth 2 @@ -85,6 +87,7 @@ ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) ; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI1_0) ; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB1_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 ; LA64F-NEXT: # Child Loop BB1_3 Depth 2 @@ -120,6 +123,7 @@ ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) ; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI1_0) ; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB1_1: # %atomicrmw.start ; LA64D-NEXT: # 
=>This Loop Header: Depth=1 ; LA64D-NEXT: # Child Loop BB1_3 Depth 2 @@ -159,6 +163,7 @@ ; LA64F-NEXT: addi.w $a1, $zero, 1 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: ffint.s.w $fa1, $fa1 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB2_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 ; LA64F-NEXT: # Child Loop BB2_3 Depth 2 @@ -195,6 +200,7 @@ ; LA64D-NEXT: addi.w $a1, $zero, 1 ; LA64D-NEXT: movgr2fr.w $fa1, $a1 ; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB2_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 ; LA64D-NEXT: # Child Loop BB2_3 Depth 2 @@ -235,6 +241,7 @@ ; LA64F-NEXT: addi.w $a1, $zero, 1 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: ffint.s.w $fa1, $fa1 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB3_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 ; LA64F-NEXT: # Child Loop BB3_3 Depth 2 @@ -271,6 +278,7 @@ ; LA64D-NEXT: addi.w $a1, $zero, 1 ; LA64D-NEXT: movgr2fr.w $fa1, $a1 ; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB3_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 ; LA64D-NEXT: # Child Loop BB3_3 Depth 2 @@ -322,6 +330,7 @@ ; LA64F-NEXT: addi.d $s2, $sp, 16 ; LA64F-NEXT: addi.d $s3, $sp, 8 ; LA64F-NEXT: ori $s4, $zero, 2 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB4_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64F-NEXT: st.d $a0, $sp, 16 @@ -368,6 +377,7 @@ ; LA64D-NEXT: addi.d $s1, $sp, 16 ; LA64D-NEXT: addi.d $s2, $sp, 8 ; LA64D-NEXT: ori $s3, $zero, 2 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB4_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fst.d $fa0, $sp, 16 @@ -414,6 +424,7 @@ ; LA64F-NEXT: addi.d $s2, $sp, 16 ; LA64F-NEXT: addi.d $s3, $sp, 8 ; LA64F-NEXT: ori $s4, $zero, 2 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB5_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 
; LA64F-NEXT: st.d $a0, $sp, 16 @@ -460,6 +471,7 @@ ; LA64D-NEXT: addi.d $s1, $sp, 16 ; LA64D-NEXT: addi.d $s2, $sp, 8 ; LA64D-NEXT: ori $s3, $zero, 2 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB5_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fst.d $fa0, $sp, 16 @@ -506,6 +518,7 @@ ; LA64F-NEXT: addi.d $s2, $sp, 16 ; LA64F-NEXT: addi.d $s3, $sp, 8 ; LA64F-NEXT: ori $s4, $zero, 2 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB6_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64F-NEXT: st.d $a0, $sp, 16 @@ -552,6 +565,7 @@ ; LA64D-NEXT: addi.d $s1, $sp, 16 ; LA64D-NEXT: addi.d $s2, $sp, 8 ; LA64D-NEXT: ori $s3, $zero, 2 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB6_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fst.d $fa0, $sp, 16 @@ -599,6 +613,7 @@ ; LA64F-NEXT: addi.d $s2, $sp, 16 ; LA64F-NEXT: addi.d $s3, $sp, 8 ; LA64F-NEXT: ori $s4, $zero, 2 +; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB7_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64F-NEXT: st.d $a0, $sp, 16 @@ -645,6 +660,7 @@ ; LA64D-NEXT: addi.d $s1, $sp, 16 ; LA64D-NEXT: addi.d $s2, $sp, 8 ; LA64D-NEXT: ori $s3, $zero, 2 +; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB7_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fst.d $fa0, $sp, 16 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll --- a/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/br.ll @@ -5,6 +5,7 @@ define void @foo() noreturn nounwind { ; ALL-LABEL: foo: ; ALL: # %bb.0: # %entry +; ALL-NEXT: .p2align 4, , 16 ; ALL-NEXT: .LBB0_1: # %loop ; ALL-NEXT: # =>This Inner Loop Header: Depth=1 ; ALL-NEXT: b .LBB0_1 diff --git a/llvm/test/CodeGen/LoongArch/preferred-alignments.ll b/llvm/test/CodeGen/LoongArch/preferred-alignments.ll new file mode 
100644 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/preferred-alignments.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 < %s | FileCheck --check-prefix=LA464 %s +; RUN: llc --mtriple=loongarch64 --mcpu=la464 < %s | FileCheck --check-prefix=LA464 %s + +define signext i32 @sum(ptr noalias nocapture noundef readonly %0, i32 noundef signext %1) { +; LA464-LABEL: sum: +; LA464: # %bb.0: +; LA464-NEXT: ori $a2, $zero, 1 +; LA464-NEXT: blt $a1, $a2, .LBB0_4 +; LA464-NEXT: # %bb.1: +; LA464-NEXT: bstrpick.d $a2, $a1, 31, 0 +; LA464-NEXT: move $a1, $zero +; LA464-NEXT: .p2align 4, , 16 +; LA464-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 +; LA464-NEXT: ld.w $a3, $a0, 0 +; LA464-NEXT: add.d $a1, $a3, $a1 +; LA464-NEXT: addi.d $a0, $a0, 4 +; LA464-NEXT: addi.d $a2, $a2, -1 +; LA464-NEXT: bnez $a2, .LBB0_2 +; LA464-NEXT: # %bb.3: +; LA464-NEXT: addi.w $a0, $a1, 0 +; LA464-NEXT: ret +; LA464-NEXT: .LBB0_4: +; LA464-NEXT: move $a1, $zero +; LA464-NEXT: addi.w $a0, $a1, 0 +; LA464-NEXT: ret + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %4, label %6 + +4: ; preds = %2 + %5 = zext i32 %1 to i64 + br label %8 + +6: ; preds = %8, %2 + %7 = phi i32 [ 0, %2 ], [ %13, %8 ] + ret i32 %7 + +8: ; preds = %4, %8 + %9 = phi i64 [ 0, %4 ], [ %14, %8 ] + %10 = phi i32 [ 0, %4 ], [ %13, %8 ] + %11 = getelementptr inbounds i32, ptr %0, i64 %9 + %12 = load i32, ptr %11, align 4 + %13 = add nsw i32 %12, %10 + %14 = add nuw nsw i64 %9, 1 + %15 = icmp eq i64 %14, %5 + br i1 %15, label %6, label %8 +}