diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -328,10 +328,6 @@ InstructionCost PPCTTIImpl::getUserCost(const User *U, ArrayRef Operands, TTI::TargetCostKind CostKind) { - // Set the max cost if an MMA type is present (v256i1, v512i1). - if (isMMAType(U->getType())) - return InstructionCost::getMax(); - // We already implement getCastInstrCost and getMemoryOpCost where we perform // the vector adjustment there. if (isa(U) || isa(U) || isa(U)) diff --git a/llvm/test/Transforms/LoopUnroll/PowerPC/p10-respect-unroll-pragma.ll b/llvm/test/Transforms/LoopUnroll/PowerPC/p10-respect-unroll-pragma.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/PowerPC/p10-respect-unroll-pragma.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes='default' -mtriple=powerpc64le-- -o - %s | \ +; RUN: FileCheck %s + +%0 = type <{ double }> + +define dso_local void @test(i32* %arg) #0 { +; CHECK-LABEL: @test( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB16:%.*]] +; CHECK: bb16: +; CHECK-NEXT: [[I20:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20]]) +; CHECK-NEXT: [[I24_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_1:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_1]]) +; CHECK-NEXT: [[I24_ELT_1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_1]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_1]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_1]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_1]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_2:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_2]]) +; CHECK-NEXT: [[I24_ELT_2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_2]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_2]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_2]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_2]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_3:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_3:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_3]]) +; CHECK-NEXT: [[I24_ELT_3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_3]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_3]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_3]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_3]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_4:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_4:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_4]]) +; CHECK-NEXT: [[I24_ELT_4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_4]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_4]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_4]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_4]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_5:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_5:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_5]]) +; CHECK-NEXT: [[I24_ELT_5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_5]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_5]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_5]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_5]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_6:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_6:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_6]]) +; CHECK-NEXT: [[I24_ELT_6:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_6]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_6]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_6:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_6]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_6]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_7:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_7:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_7]]) +; CHECK-NEXT: [[I24_ELT_7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_7]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_7]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_7]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_7]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_8:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_8:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_8]]) +; CHECK-NEXT: [[I24_ELT_8:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_8]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_8]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_8:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_8]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_8]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_9:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_9:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_9]]) +; CHECK-NEXT: [[I24_ELT_9:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_9]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_9]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_9:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_9]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_9]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_10:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_10:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_10]]) +; CHECK-NEXT: [[I24_ELT_10:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_10]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_10]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_10:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_10]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_10]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_11:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_11:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_11]]) +; CHECK-NEXT: [[I24_ELT_11:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_11]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_11]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_11:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_11]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_11]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_12:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_12:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_12]]) +; CHECK-NEXT: [[I24_ELT_12:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_12]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_12]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_12:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_12]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_12]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_13:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_13:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_13]]) +; CHECK-NEXT: [[I24_ELT_13:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_13]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_13]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_13:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_13]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_13]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_14:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_14:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_14]]) +; CHECK-NEXT: [[I24_ELT_14:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_14]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_14]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_14:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_14]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_14]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: [[I20_15:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* nonnull inttoptr (i64 -32 to i8*)) +; CHECK-NEXT: [[I24_15:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_15]]) +; CHECK-NEXT: [[I24_ELT_15:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_15]], 0 +; CHECK-NEXT: store <16 x i8> [[I24_ELT_15]], <16 x i8>* inttoptr (i64 48 to <16 x i8>*), align 16 +; CHECK-NEXT: [[I24_ELT1_15:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_15]], 1 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_15]], <16 x i8>* inttoptr (i64 64 to <16 x i8>*), align 64 +; CHECK-NEXT: br label [[BB16]], !llvm.loop [[LOOP0:![0-9]+]] +; +bb: + %i = alloca i32*, align 8 + store i32* %arg, i32** %i, align 8 + %i1 = alloca [0 x %0]*, align 8 + %i2 = alloca double*, align 8 + %i3 = alloca i32, align 4 + %i4 = alloca i32, align 4 + %i5 = alloca i64, align 8 + %i6 = alloca i64, align 8 + %i7 = alloca <256 x i1>, align 32 + %i8 = load i32*, i32** %i, align 8 + %i9 = load i32, i32* %i8, align 4 + %i10 = sub nsw i32 %i9, 0 + store i32 %i10, i32* %i4, align 4 + %i11 = load i32, i32* %i4, align 4 + %i12 = ashr i32 %i11, 5 + %i13 = sext i32 %i12 to i64 + %i14 = load i64, i64* %i6, align 8 + %i15 = sub nsw i64 %i14, 1 + br label %bb16 + +bb16: ; preds = %bb16, %bb + %i17 = load i64, i64* %i5, align 8 + %i18 = icmp sge i64 %i17, 1 + %i19 = getelementptr i8, i8* null, i64 -32 + %i20 = call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %i19) + store <256 x i1> %i20, <256 x i1>* %i7, align 32 + %i21 = getelementptr inbounds i8, i8* null, i64 48 + %i22 = bitcast i8* %i21 to <2 x double>* + %i23 = load <256 x i1>, <256 x i1>* %i7, align 32 + %i24 = call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %i23) + %i25 = bitcast <2 x double>* %i22 to { <16 x i8>, <16 x i8> }* + store { <16 x i8>, <16 x i8> } %i24, { <16 x i8>, <16 x i8> }* %i25, align 16 + br label %bb16, !llvm.loop !1 +} + +; Function Attrs: argmemonly nounwind readonly +declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) #1 + +; Function Attrs: nounwind readnone +declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>) #2 + +attributes #0 = { "no-trapping-math"="true" } +attributes #1 = { argmemonly nounwind readonly } +attributes #2 = { nounwind readnone } + +!1 = distinct !{!1, !2, !3, !4} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = !{!"llvm.loop.unroll.count", i32 16}