diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -88,10 +88,10 @@
     NewCost = VectorOpCost + ExtractCost +
               !Ext0->hasOneUse() * ExtractCost +
               !Ext1->hasOneUse() * ExtractCost;
   }
-  // TODO: The cost comparison should not differ based on opcode. Either we
-  // want to be uniformly more or less aggressive in deciding if a vector
-  // operation should replace the scalar operation.
-  return IsBinOp ? OldCost <= NewCost : OldCost < NewCost;
+  // Aggressively form a vector op if the cost is equal because the transform
+  // may enable further optimization.
+  // Codegen can reverse this transform (scalarize) if it was not profitable.
+  return OldCost < NewCost;
 }
 
 /// Try to reduce extract element costs by converting scalar compares to vector
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -74,14 +74,15 @@
   ret i8 %r
 }
 
-; Negative test - extracts are free and vector op has same cost as scalar.
+; Extracts are free and vector op has same cost as scalar, but we
+; speculatively transform to vector to create more optimization
+; opportunities.
 
 define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext0_ext0_fadd(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = fadd double [[E0]], [[E1]]
-; CHECK-NEXT:    ret double [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    ret double [[TMP2]]
 ;
   %e0 = extractelement <2 x double> %x, i32 0
   %e1 = extractelement <2 x double> %y, i32 0
@@ -118,14 +119,14 @@
   ret double %r
 }
 
-; Negative test - disguised same vector operand; scalar code is cheaper than general case.
+; Disguised same vector operand; scalar code is not cheaper (with default
+; x86 target), so aggressively form vector binop.
 
 define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {
 ; CHECK-LABEL: @ext1_ext1_add_same_vec(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = add i32 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i32 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %e0 = extractelement <4 x i32> %x, i32 1
   %e1 = extractelement <4 x i32> %x, i32 1
@@ -133,13 +134,13 @@
   ret i32 %r
 }
 
-; Negative test - same vector operand; scalar code is cheaper than general case.
+; Functionally equivalent to above test; should transform as above.
 
 define i32 @ext1_ext1_add_same_vec_cse(<4 x i32> %x) {
 ; CHECK-LABEL: @ext1_ext1_add_same_vec_cse(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = add i32 [[E0]], [[E0]]
-; CHECK-NEXT:    ret i32 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %e0 = extractelement <4 x i32> %x, i32 1
   %r = add i32 %e0, %e0
@@ -200,15 +201,15 @@
   ret i8 %r
 }
 
-; Negative test - vector code would not be cheaper.
+; Vector code costs the same as scalar, so aggressively form vector op.
 
 define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext1_ext1_add_uses1(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
 ; CHECK-NEXT:    call void @use_i8(i8 [[E0]])
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = add i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i8> [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 0
   call void @use_i8(i8 %e0)
@@ -217,15 +218,15 @@
   ret i8 %r
 }
 
-; Negative test - vector code would not be cheaper.
+; Vector code costs the same as scalar, so aggressively form vector op.
 
 define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext1_ext1_add_uses2(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
 ; CHECK-NEXT:    call void @use_i8(i8 [[E1]])
-; CHECK-NEXT:    [[R:%.*]] = add i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 0
   %e1 = extractelement <16 x i8> %y, i32 0
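Note: a hand-written sketch (illustration only, not part of this patch; the
function name is made up) of the kind of follow-on optimization the new
"may enable further optimization" comment alludes to. With the relaxed
equal-cost check, the lane-0 fadd below is also rewritten as a vector fadd +
extract, so both scalar fadds become the same vector op that a later CSE run
can merge:

  define double @ext_fadd_chain(<2 x double> %x, <2 x double> %y) {
    ; Lane-0 extracts are free on x86, so old and new costs tie here;
    ; this transform fires only with the relaxed check in this patch.
    %x0 = extractelement <2 x double> %x, i32 0
    %y0 = extractelement <2 x double> %y, i32 0
    %a0 = fadd double %x0, %y0
    ; Lane-1 ops rewrite to the identical vector fadd + extract of lane 1.
    %x1 = extractelement <2 x double> %x, i32 1
    %y1 = extractelement <2 x double> %y, i32 1
    %a1 = fadd double %x1, %y1
    %r = fmul double %a0, %a1
    ret double %r
  }

Expected result after VectorCombine plus CSE, roughly:

    %v = fadd <2 x double> %x, %y
    %a0 = extractelement <2 x double> %v, i32 0
    %a1 = extractelement <2 x double> %v, i32 1
    %r = fmul double %a0, %a1
    ret double %r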