Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -493,32 +493,49 @@ int ISD = TLI->InstructionOpcodeToISD(Opcode); - if (ISD == ISD::SDIV && - Opd2Info == TargetTransformInfo::OK_UniformConstantValue && - Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { - // On AArch64, scalar signed division by constants power-of-two are - // normally expanded to the sequence ADD + CMP + SELECT + SRA. - // The OperandValue properties many not be same as that of previous - // operation; conservatively assume OP_None. - Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - return Cost; - } - switch (ISD) { default: return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); + case ISD::SDIV: + if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On AArch64, scalar signed divisions by power-of-two constants are + // normally expanded to the sequence ADD + CMP + SELECT + SRA. + // The OperandValue properties may not be the same as those of the + // previous operation; conservatively assume OP_None. 
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + return Cost; + } + LLVM_FALLTHROUGH; + case ISD::UDIV: + Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); + if (Ty->isVectorTy()) { + // On AArch64, vector divisions are not supported natively and are + // expanded into scalar divisions of each pair of elements. + // The extract/insert costs reuse the caller's operand properties. + Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); + Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); + // TODO: if one of the arguments is a scalar, then it's not necessary to + // double the cost of handling the vector elements. + Cost += Cost; + } + return Cost; + case ISD::ADD: case ISD::MUL: case ISD::XOR: Index: llvm/test/Analysis/CostModel/AArch64/vdiv.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/CostModel/AArch64/vdiv.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -O3 -debug-only=loop-vectorize 2> %t; FileCheck %s --input-file %t + +; CHECK: LV: Scalar loop costs: {{[0-9]}}. +; CHECK: LV: Vector loop of width 2 costs: {{[0-9]+}}. +; CHECK: LV: Vector loop of width 4 costs: {{[0-9]+}}. +; CHECK: LV: Selecting VF: 1. 
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define void @scale(i32* noalias nocapture %p, i32* noalias nocapture readonly %scale, i32 %width) #0 { +entry: + %cmp = icmp ugt i32 %width, 3 + %rem = and i32 %width, 3 + %cmp1 = icmp eq i32 %rem, 0 + %or.cond17 = and i1 %cmp, %cmp1 + br i1 %or.cond17, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %width to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %arrayidx4 = getelementptr inbounds i32, i32* %scale, i64 %indvars.iv + %1 = load i32, i32* %arrayidx4, align 4, !tbaa !2 + %div = sdiv i32 %0, %1 + store i32 %div, i32* %arrayidx, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { norecurse nounwind } + +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"}