Widen fneg in the loop vectorizer.

IRBuilder gains CreateNAryOp, which forwards to CreateBinOp for binary opcodes
(two operands) and to CreateUnOp for unary opcodes (one operand). The
vectorizer's widening switch uses it so that FNeg is widened together with the
binary operators, the cost switch gets an FNeg case based on
getArithmeticInstrCost, and FNeg is added to one more opcode list whose purpose
is not visible in its hunk. The tests now expect a cost of 4 at VF 1, 2 and 4
and a single `fneg <4 x float>` instead of per-lane extract/fneg/insert.

NOTE(review): leading indentation and runs of spaces inside the lines below
were reconstructed by convention; confirm against the target tree before
applying (or apply with whitespace-insensitive matching).

diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -1383,6 +1383,25 @@
     return Insert(UnOp, Name);
   }
 
+  /// Create either a unary or a binary operator depending on \p Opc.
+  /// \p Ops must hold exactly one operand for a unary opcode and exactly two
+  /// for a binary opcode; any other opcode is unreachable.
+  Value *CreateNAryOp(unsigned Opc, ArrayRef<Value *> Ops,
+                      const Twine &Name = "",
+                      MDNode *FPMathTag = nullptr) {
+    if (Instruction::isBinaryOp(Opc)) {
+      assert(Ops.size() == 2 && "Invalid number of operands!");
+      return CreateBinOp(static_cast<Instruction::BinaryOps>(Opc),
+                         Ops[0], Ops[1], Name, FPMathTag);
+    }
+    if (Instruction::isUnaryOp(Opc)) {
+      assert(Ops.size() == 1 && "Invalid number of operands!");
+      return CreateUnOp(static_cast<Instruction::UnaryOps>(Opc),
+                        Ops[0], Name, FPMathTag);
+    }
+    llvm_unreachable("Unexpected opcode!");
+  }
+
   //===--------------------------------------------------------------------===//
   // Instruction creation methods: Memory Instructions
   //===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3969,6 +3969,7 @@
   case Instruction::FAdd:
   case Instruction::Sub:
   case Instruction::FSub:
+  case Instruction::FNeg:
   case Instruction::Mul:
   case Instruction::FMul:
   case Instruction::FDiv:
@@ -3979,21 +3980,22 @@
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    // Just widen binops.
-    auto *BinOp = cast<BinaryOperator>(&I);
-    setDebugLocFromInst(Builder, BinOp);
+    // Just widen unops and binops.
+    setDebugLocFromInst(Builder, &I);
 
     for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
-      Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
-      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+      SmallVector<Value *, 2> Ops;
+      for (Value *Op : I.operands())
+        Ops.push_back(getOrCreateVectorValue(Op, Part));
+
+      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
 
-      if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
-        VecOp->copyIRFlags(BinOp);
+      if (auto *VecOp = dyn_cast<Instruction>(V))
+        VecOp->copyIRFlags(&I);
 
       // Use this vector value for all users of the original instruction.
       VectorLoopValueMap.setVectorValue(&I, Part, V);
-      addMetadata(V, BinOp);
+      addMetadata(V, &I);
     }
 
     break;
@@ -5960,6 +5962,14 @@
                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
   }
+  case Instruction::FNeg: {
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+    return N * TTI.getArithmeticInstrCost(
+                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
+                   I->getOperand(0));
+  }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
@@ -6589,6 +6599,7 @@
   case Instruction::FCmp:
   case Instruction::FDiv:
   case Instruction::FMul:
+  case Instruction::FNeg:
   case Instruction::FPExt:
   case Instruction::FPToSI:
   case Instruction::FPToUI:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
@@ -5,9 +5,9 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-; CHECK: Found an estimated cost of 2 for VF 1 For instruction: %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 6 for VF 2 For instruction: %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 14 for VF 4 For instruction: %neg = fneg float %{{.*}}
+; CHECK: Found an estimated cost of 4 for VF 1 For instruction: %neg = fneg float %{{.*}}
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %neg = fneg float %{{.*}}
+; CHECK: Found an estimated cost of 4 for VF 4 For instruction: %neg = fneg float %{{.*}}
 define void @fneg_cost(float* %a, i64 %n) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/fneg.ll b/llvm/test/Transforms/LoopVectorize/fneg.ll
--- a/llvm/test/Transforms/LoopVectorize/fneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/fneg.ll
@@ -3,19 +3,8 @@
 define void @foo(float* %a, i64 %n) {
 ; CHECK: vector.body:
 ; CHECK: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* {{.*}}, align 4
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = fneg float [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = fneg float [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: [[TMP11:%.*]] = fneg float [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP9]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i32 3
-; CHECK: store <4 x float> [[TMP15]], <4 x float>* {{.*}}, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
+; CHECK: store <4 x float> [[TMP4]], <4 x float>* {{.*}}, align 4
 ;
 entry:
   br label %for.body