diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -128,7 +128,7 @@
 static cl::opt<bool>
     EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
                  cl::desc("Enable optimizations on complex GEPs"),
-                 cl::init(false));
+                 cl::init(true));
 
 static cl::opt<bool>
     BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
@@ -563,17 +563,6 @@
       addPass(createFalkorMarkStridedAccessesPass());
   }
 
-  TargetPassConfig::addIRPasses();
-
-  addPass(createAArch64StackTaggingPass(
-      /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
-
-  // Match interleaved memory accesses to ldN/stN intrinsics.
-  if (TM->getOptLevel() != CodeGenOpt::None) {
-    addPass(createInterleavedLoadCombinePass());
-    addPass(createInterleavedAccessPass());
-  }
-
   if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
     // Call SeparateConstOffsetFromGEP pass to extract constants within indices
     // and lower a GEP with multiple indices to either arithmetic operations or
@@ -587,6 +576,17 @@
     addPass(createLICMPass());
   }
 
+  TargetPassConfig::addIRPasses();
+
+  addPass(createAArch64StackTaggingPass(
+      /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
+
+  // Match interleaved memory accesses to ldN/stN intrinsics.
+  if (TM->getOptLevel() != CodeGenOpt::None) {
+    addPass(createInterleavedLoadCombinePass());
+    addPass(createInterleavedAccessPass());
+  }
+
   // Add Control Flow Guard checks.
   if (TM->getTargetTriple().isOSWindows())
     addPass(createCFGuardCheckPass());
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-gep.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-gep.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-gep.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator-gep.ll
@@ -7,45 +7,51 @@
 define i32 @cse_gep([4 x i32]* %ptr, i32 %idx) {
   ; O0-LABEL: name: cse_gep
   ; O0: bb.1 (%ir-block.0):
-  ; O0:   liveins: $w1, $x0
-  ; O0:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
-  ; O0:   [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
-  ; O0:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
-  ; O0:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-  ; O0:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
-  ; O0:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
-  ; O0:   [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
-  ; O0:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
-  ; O0:   [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
-  ; O0:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL1]](s64)
-  ; O0:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-  ; O0:   [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
-  ; O0:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.gep2)
-  ; O0:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[LOAD1]]
-  ; O0:   $w0 = COPY [[ADD]](s32)
-  ; O0:   RET_ReallyLR implicit $w0
+  ; O0-NEXT:   liveins: $w1, $x0
+  ; O0-NEXT: {{  $}}
+  ; O0-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; O0-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+  ; O0-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
+  ; O0-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; O0-NEXT:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
+  ; O0-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
+  ; O0-NEXT:   [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
+  ; O0-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
+  ; O0-NEXT:   [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
+  ; O0-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL1]](s64)
+  ; O0-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; O0-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
+  ; O0-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.gep2)
+  ; O0-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD1]]
+  ; O0-NEXT:   $w0 = COPY [[ADD]](s32)
+  ; O0-NEXT:   RET_ReallyLR implicit $w0
   ; O3-LABEL: name: cse_gep
   ; O3: bb.1 (%ir-block.0):
-  ; O3:   liveins: $w1, $x0
-  ; O3:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
-  ; O3:   [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
-  ; O3:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
-  ; O3:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-  ; O3:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
-  ; O3:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
-  ; O3:   [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
-  ; O3:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
-  ; O3:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-  ; O3:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C1]](s64)
-  ; O3:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s32) from %ir.gep2)
-  ; O3:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[LOAD1]]
-  ; O3:   $w0 = COPY [[ADD]](s32)
-  ; O3:   RET_ReallyLR implicit $w0
+  ; O3-NEXT:   liveins: $w1, $x0
+  ; O3-NEXT: {{  $}}
+  ; O3-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; O3-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+  ; O3-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; O3-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
+  ; O3-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; O3-NEXT:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C1]]
+  ; O3-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
+  ; O3-NEXT:   [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
+  ; O3-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
+  ; O3-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C]](s64)
+  ; O3-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SHL]](s64)
+  ; O3-NEXT:   [[COPY3:%[0-9]+]]:_(p0) = COPY [[PTR_ADD1]](p0)
+  ; O3-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; O3-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY3]], [[C2]](s64)
+  ; O3-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.3)
+  ; O3-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD1]]
+  ; O3-NEXT:   $w0 = COPY [[ADD]](s32)
+  ; O3-NEXT:   RET_ReallyLR implicit $w0
   %sidx = sext i32 %idx to i64
   %gep1 = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i64 %sidx, i64 0
   %v1 = load i32, i32* %gep1
   %gep2 = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i64 %sidx, i64 1
   %v2 = load i32, i32* %gep2
-  %res = add i32 %v2, %v2
+  %res = add i32 %v1, %v2
   ret i32 %res
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1458,10 +1458,12 @@
 ; O3-LABEL: name: test_lifetime_intrin
 ; O3: {{%[0-9]+}}:_(p0) = G_FRAME_INDEX %stack.0.slot
 ; O3-NEXT: LIFETIME_START %stack.0.slot
+; O3-NEXT: G_STORE
 ; O3-NEXT: LIFETIME_END %stack.0.slot
 ; O3-NEXT: RET_ReallyLR
   %slot = alloca i8, i32 4
   call void @llvm.lifetime.start.p0i8(i64 0, i8* %slot)
+  store volatile i8 10, i8* %slot
   call void @llvm.lifetime.end.p0i8(i64 0, i8* %slot)
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -33,9 +33,20 @@
 ; CHECK-NEXT: Scalar Evolution Analysis
 ; CHECK-NEXT: Loop Data Prefetch
 ; CHECK-NEXT: Falkor HW Prefetch Fix
-; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
+; CHECK-NEXT: Early CSE
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: Canonicalize natural loops
+; CHECK-NEXT: LCSSA Verifier
+; CHECK-NEXT: Loop-Closed SSA Form Pass
+; CHECK-NEXT: Scalar Evolution Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
+; CHECK-NEXT: Loop Pass Manager
+; CHECK-NEXT: Loop Invariant Code Motion
+; CHECK-NEXT: Module Verifier
 ; CHECK-NEXT: Loop Pass Manager
 ; CHECK-NEXT: Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT: Induction Variable Users
diff --git a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
--- a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
+++ b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
@@ -27,15 +27,16 @@
 define void @test_add_cbz_multiple_use(i32 %a, i32 %b, i32* %ptr) {
 ; CHECK-LABEL: test_add_cbz_multiple_use:
 ; CHECK:       // %bb.0: // %common.ret
-; CHECK-NEXT:    adds w8, w0, w1
-; CHECK-NEXT:    csel w8, wzr, w8, ne
+; CHECK-NEXT:    mov w8, #10
+; CHECK-NEXT:    adds w9, w0, w1
+; CHECK-NEXT:    csel w8, w8, w9, ne
 ; CHECK-NEXT:    str w8, [x2]
 ; CHECK-NEXT:    ret
   %c = add nsw i32 %a, %b
   %d = icmp ne i32 %c, 0
   br i1 %d, label %L1, label %L2
 L1:
-  store i32 0, i32* %ptr, align 4
+  store i32 10, i32* %ptr, align 4
   ret void
 L2:
   store i32 %c, i32* %ptr, align 4
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/lit.local.cfg b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/split-gep.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AArch64/split-gep.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -O3 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+%struct = type { i32, i32, i32 }
+
+define i32 @test1(%struct* %ptr, i64 %idx) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #12
+; CHECK-NEXT:    madd x8, x1, x8, x0
+; CHECK-NEXT:    ldr w9, [x8, #4]
+; CHECK-NEXT:    tbnz w9, #31, .LBB0_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %then
+; CHECK-NEXT:    ldr w8, [x8, #8]
+; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ret
+  %gep.1 = getelementptr %struct, %struct* %ptr, i64 %idx, i32 1
+  %lv.1 = load i32, i32* %gep.1
+  %c = icmp slt i32 %lv.1, 0
+  br i1 %c, label %then, label %else
+
+then:
+  %gep.2 = getelementptr %struct, %struct* %ptr, i64 %idx, i32 2
+  %lv.2 = load i32, i32* %gep.2
+  %res = add i32 %lv.1, %lv.2
+  ret i32 %res
+
+else:
+  ret i32 0
+}
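Reviewer note (illustration only, not part of the patch): the sketch below is hand-written LLVM IR with an invented function name, @split_gep_sketch, showing the "variadic base + constant offset" shape that SeparateConstOffsetFromGEP aims for and that the following EarlyCSE run can then exploit. It is an equivalent rewrite of @test1 from split-gep.ll above, not literal pass output; the actual lowering may use arithmetic (ptrtoint/add) instead of i8 GEPs.

; Illustrative sketch only -- hand-written equivalent IR, names are hypothetical.
%struct = type { i32, i32, i32 }

define i32 @split_gep_sketch(%struct* %ptr, i64 %idx) {
  ; Variadic base: depends only on %idx, so both field accesses can share it.
  %base    = getelementptr %struct, %struct* %ptr, i64 %idx
  %base.i8 = bitcast %struct* %base to i8*
  ; Constant offsets split off the GEPs: field 1 is at byte 4, field 2 at byte 8.
  %f1.i8 = getelementptr i8, i8* %base.i8, i64 4
  %f2.i8 = getelementptr i8, i8* %base.i8, i64 8
  %f1    = bitcast i8* %f1.i8 to i32*
  %f2    = bitcast i8* %f2.i8 to i32*
  %lv.1  = load i32, i32* %f1
  %lv.2  = load i32, i32* %f2
  %res   = add i32 %lv.1, %lv.2
  ret i32 %res
}

Once both field addresses hang off the shared %base, the backend can compute the variadic part once (the madd in the CHECK lines above) and fold the +4/+8 constants into the load addressing modes (ldr w9, [x8, #4] and ldr w8, [x8, #8]).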