diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -644,8 +644,20 @@
     }
     return Cost;
 
-  case ISD::ADD:
   case ISD::MUL:
+    if (LT.second != MVT::v2i64)
+      return (Cost + 1) * LT.first;
+    // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
+    // as elements are extracted from the vectors and the muls scalarized.
+    // As getScalarizationOverhead is a bit too pessimistic, we estimate the
+    // cost for an i64 vector directly here, which is:
+    // - four i64 extracts,
+    // - two i64 inserts, and
+    // - two muls.
+    // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with
+    // LT.first = 2 the cost is 16.
+    return LT.first * 8;
+  case ISD::ADD:
   case ISD::XOR:
   case ISD::OR:
   case ISD::AND:
diff --git a/llvm/test/Analysis/CostModel/AArch64/mul.ll b/llvm/test/Analysis/CostModel/AArch64/mul.ll
--- a/llvm/test/Analysis/CostModel/AArch64/mul.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/mul.ll
@@ -113,7 +113,7 @@
 
 define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) {
 ; THROUGHPUT-LABEL: 't13'
-; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul nsw <2 x i64> %a, %b
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %1 = mul nsw <2 x i64> %a, %b
 ; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %1
 ;
   %1 = mul nsw <2 x i64> %a, %b
@@ -122,7 +122,7 @@
 
 define <4 x i64> @t14(<4 x i64> %a, <4 x i64> %b) {
 ; THROUGHPUT-LABEL: 't14'
-; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = mul nsw <4 x i64> %a, %b
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %1 = mul nsw <4 x i64> %a, %b
 ; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
 ;
   %1 = mul nsw <4 x i64> %a, %b
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -9,8 +9,8 @@
 ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
 
 ; CM: LV: Scalar loop costs: 7.
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
+; CM: LV: Found an estimated cost of 19 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 19 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
 ; Check that the extractvalue operands are actually free in vector code.
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll
@@ -27,8 +27,7 @@
 ;   str   q0, [x0]
 ;   ret
 ;
-; but if we don't SLP vectorise these examples we get this which is smaller
-; and faster:
+; If we don't SLP vectorise but scalarize these examples, we get this instead:
 ;
 ;   ldp   x8, x9, [x1]
 ;   ldp   x10, x11, [x0]
@@ -37,20 +36,19 @@
 ;   stp   x8, x9, [x0]
 ;   ret
 ;
-; FIXME: don't SLP vectorise this.
-
 define void @mul(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @mul(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[A:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    store i64 [[MUL]], i64* [[A]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[MUL4:%.*]] = mul nsw i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    store i64 [[MUL4]], i64* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -79,16 +77,18 @@
 define void @mac(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @mac(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[TMP4]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[A:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[MUL4:%.*]] = mul nsw i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[MUL]], [[TMP0]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* [[A]], align 8
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i64 [[MUL4]], [[TMP2]]
+; CHECK-NEXT:    store i64 [[ADD9]], i64* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
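Note on where the cost of 8 comes from: with no MUL.2d instruction, the backend scalarizes a v2i64 multiply. A minimal sketch of the resulting sequence, in the style of the asm comments in the SLP test above (register choices are illustrative, not the verbatim codegen output):

;   fmov  x10, d0        // extract lane 0 of each operand
;   fmov  x11, d1
;   mov   x8, v0.d[1]    // extract lane 1 of each operand
;   mov   x9, v1.d[1]
;   mul   x10, x10, x11  // two scalar multiplies
;   mul   x8, x8, x9
;   fmov  d0, x10        // insert both results back into a vector
;   mov   v0.d[1], x8

That is four extracts, two multiplies, and two inserts: eight instructions per legalized v2i64, hence LT.first * 8. A v4i64 is legalized into two v2i64 halves (LT.first = 2), giving 16.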
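To verify the new throughput numbers locally, the cost-model analysis can be rerun over the test file. The invocation below assumes the legacy -cost-model -analyze flags used by the RUN lines of these tests at the time; on a tree that has moved to the new pass manager, the print<cost-model> pass is the equivalent:

  opt < llvm/test/Analysis/CostModel/AArch64/mul.ll \
      -cost-model -analyze -mtriple=aarch64--linux-gnu

The output should report cost 8 for the mul nsw <2 x i64> in @t13 and cost 16 for the mul nsw <4 x i64> in @t14.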