diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -463,6 +463,28 @@ const DominatorTree *DT = nullptr, const TargetLibraryInfo *TLI = nullptr); + /// This returns the same result as isSafeToSpeculativelyExecute if Opcode is + /// the actual opcode of Inst. If the provided and actual opcode differ, the + /// function (virtually) overrides the opcode of Inst with the provided + /// Opcode. There are some constraints in this case: + /// * If Opcode has a fixed number of operands (e.g., as binary operators do), + /// then Inst has to have at least as many leading operands. The function + /// will ignore all trailing operands beyond that number. + /// * If Opcode allows for an arbitrary number of operands (e.g., as CallInsts + /// do), then all operands are considered. + /// * The virtual instruction has to satisfy all typing rules of the provided + /// Opcode. + /// * This function is pessimistic in the following sense: If one actually + /// materialized the virtual instruction, then isSafeToSpeculativelyExecute + /// may say that the materialized instruction is speculatable whereas this + /// function may have said that the instruction wouldn't be speculatable. + /// This behavior is a shortcoming in the current implementation and not + /// intentional. + bool isSafeToSpeculativelyExecuteWithOpcode( + unsigned Opcode, const Operator *Inst, const Instruction *CtxI = nullptr, + const DominatorTree *DT = nullptr, + const TargetLibraryInfo *TLI = nullptr); + /// Returns true if the result or effects of the given instructions \p I /// depend values not reachable through the def use graph. /// * Memory dependence arises for example if the instruction reads from diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1415,11 +1415,11 @@ [ IntrReadMem, IntrNoSync, IntrWillReturn, IntrArgMemOnly ]>; def int_vp_scatter: DefaultAttrsIntrinsic<[], - [ llvm_anyvector_ty, - LLVMVectorOfAnyPointersToElt<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty], - [ IntrArgMemOnly, IntrNoSync, IntrWillReturn ]>; // TODO allow IntrNoCapture for vectors of pointers + [ llvm_anyvector_ty, + LLVMVectorOfAnyPointersToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ IntrArgMemOnly, IntrNoSync, IntrWillReturn ]>; // TODO allow IntrNoCapture for vectors of pointers // Experimental strided memory accesses def int_experimental_vp_strided_store : DefaultAttrsIntrinsic<[], @@ -1437,8 +1437,9 @@ llvm_i32_ty], [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>; -// Speculatable Binary operators -let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { +// Operators +let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { + // Integer arithmetic def int_vp_add : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -1450,30 +1451,30 @@ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; def int_vp_mul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_ashr : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>,
- LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_lshr : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_shl : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; def int_vp_or : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_and : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -1484,35 +1485,28 @@ LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; -} - -// Non-speculatable binary operators. -let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { def int_vp_sdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_udiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_srem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_urem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -} + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; -// Floating-point arithmetic. 
-let IntrProperties = - [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + // Floating-point arithmetic def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -1524,177 +1518,169 @@ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; def int_vp_fmul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_fdiv : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_frem : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; - + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_fneg : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; - + [ LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_fma : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -} + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; -// Casts. -def int_vp_trunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_zext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_sext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_fptrunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_fpext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_fptoui : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_fptosi : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_uitofp : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_sitofp : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_ptrtoint : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -def int_vp_inttoptr : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; - -// Shuffles. 
-def int_vp_select : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMMatchType<0>, - LLVMMatchType<0>, - llvm_i32_ty]>; - -def int_vp_merge : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], - [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMMatchType<0>, - LLVMMatchType<0>, - llvm_i32_ty]>; - -// Comparisons. -let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { - def int_vp_fcmp : DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], - [ llvm_anyvector_ty, - LLVMMatchType<0>, - llvm_metadata_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + // Casts + def int_vp_trunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_zext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fptrunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fpext : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fptoui : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_fptosi : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_uitofp : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sitofp : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_ptrtoint : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_inttoptr : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + // Shuffles + def int_vp_select : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_i32_ty]>; + def int_vp_merge : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_i32_ty]>; + + // Comparisons + def int_vp_fcmp : DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], + [ llvm_anyvector_ty, + LLVMMatchType<0>, + llvm_metadata_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_icmp : DefaultAttrsIntrinsic<[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], - [ llvm_anyvector_ty, - LLVMMatchType<0>, - llvm_metadata_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; -} + [ llvm_anyvector_ty, + LLVMMatchType<0>, + llvm_metadata_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; -// Reductions -let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { + // Reductions def int_vp_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, 
llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_fmul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_add : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_mul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_and : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_or : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_xor : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_smax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_smin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_umax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_umin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_fmax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - [LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; def int_vp_reduce_fmin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], - 
[LLVMVectorElementType<0>, - llvm_anyvector_ty, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_i32_ty]>; + [ LLVMVectorElementType<0>, + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; } def int_get_active_lane_mask: diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4580,13 +4580,38 @@ const Operator *Inst = dyn_cast<Operator>(V); if (!Inst) return false; + return isSafeToSpeculativelyExecuteWithOpcode(Inst->getOpcode(), Inst, CtxI, DT, TLI); +} + +bool llvm::isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, + const Operator *Inst, + const Instruction *CtxI, + const DominatorTree *DT, + const TargetLibraryInfo *TLI) { + if (Inst->getOpcode() != Opcode) { + // Check that the operands are actually compatible with the Opcode override. + auto hasEqualReturnAndLeadingOperandTypes = + [](const Operator *Inst, unsigned NumLeadingOperands) { + if (Inst->getNumOperands() < NumLeadingOperands) + return false; + const Type *ExpectedType = Inst->getType(); + for (unsigned ItOp = 0; ItOp < NumLeadingOperands; ++ItOp) + if (Inst->getOperand(ItOp)->getType() != ExpectedType) + return false; + return true; + }; + assert(!Instruction::isBinaryOp(Opcode) || + hasEqualReturnAndLeadingOperandTypes(Inst, 2)); + assert(!Instruction::isUnaryOp(Opcode) || + hasEqualReturnAndLeadingOperandTypes(Inst, 1)); + } for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) if (Constant *C = dyn_cast<Constant>(Inst->getOperand(i))) if (C->canTrap()) return false; - switch (Inst->getOpcode()) { + switch (Opcode) { default: return true; case Instruction::UDiv: @@ -4617,7 +4642,9 @@ return false; } case Instruction::Load: { - const LoadInst *LI = cast<LoadInst>(Inst); + const LoadInst *LI = dyn_cast<LoadInst>(Inst); + if (!LI) + return false; if (mustSuppressSpeculation(*LI)) return false; const DataLayout &DL = LI->getModule()->getDataLayout(); @@ -4626,7 +4653,9 @@ TLI); } case Instruction::Call: { - auto *CI = cast<CallInst>(Inst); + auto *CI = dyn_cast<CallInst>(Inst); + if (!CI) + return false; const Function *Callee = CI->getCalledFunction(); // The called function could have undefined behavior or side-effects, even diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/ExpandVectorPredication.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ExpandVectorPredication.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -113,6 +113,19 @@ OldOp.eraseFromParent(); } +static bool maySpeculateLanes(VPIntrinsic &VPI) { + // Even though non-VP reduction intrinsics are speculatable, their mask bits + // are not. + if (isa<VPReductionIntrinsic>(VPI)) + return false; + // Fall back to whether the functional operation is safe to speculatively + // execute.
+ Optional<unsigned> OpcOpt = VPI.getFunctionalOpcode(); + unsigned FunctionalOpc = OpcOpt.getValueOr((unsigned)Instruction::Call); + return isSafeToSpeculativelyExecuteWithOpcode(FunctionalOpc, + cast<Operator>(&VPI)); +} + //// } Helpers namespace { @@ -216,8 +229,7 @@ Value * CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &VPI) { - assert((isSafeToSpeculativelyExecute(&VPI) || - VPI.canIgnoreVectorLengthParam()) && + assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); auto OC = static_cast<Instruction::BinaryOps>(*VPI.getFunctionalOpcode()); @@ -296,8 +308,7 @@ Value * CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, VPReductionIntrinsic &VPI) { - assert((isSafeToSpeculativelyExecute(&VPI) || - VPI.canIgnoreVectorLengthParam()) && + assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && "Implicitly dropping %evl in non-speculatable operator!"); Value *Mask = VPI.getMaskParam(); @@ -471,9 +482,9 @@ bool isDone() const { return Strategy.shouldDoNothing(); } }; -void sanitizeStrategy(Instruction &I, VPLegalization &LegalizeStrat) { - // Speculatable instructions do not strictly need predication. - if (isSafeToSpeculativelyExecute(&I)) { +void sanitizeStrategy(VPIntrinsic &VPI, VPLegalization &LegalizeStrat) { + // Operations with speculatable lanes do not strictly need predication. + if (maySpeculateLanes(VPI)) { // Converting a speculatable VP intrinsic means dropping %mask and %evl. // No need to expand %evl into the %mask only to ignore that code. if (LegalizeStrat.OpStrategy == VPLegalization::Convert) @@ -518,7 +529,7 @@ if (!VPI) continue; auto VPStrat = getVPLegalizationStrategy(*VPI); - sanitizeStrategy(I, VPStrat); + sanitizeStrategy(*VPI, VPStrat); if (!VPStrat.shouldDoNothing()) Worklist.emplace_back(VPI, VPStrat); } diff --git a/llvm/test/CodeGen/Generic/expand-vp.ll b/llvm/test/CodeGen/Generic/expand-vp.ll --- a/llvm/test/CodeGen/Generic/expand-vp.ll +++ b/llvm/test/CodeGen/Generic/expand-vp.ll @@ -2,7 +2,7 @@ ; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=LEGAL_LEGAL ; RUN: opt --expandvp --expandvp-override-evl-transform=Discard --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=DISCARD_LEGAL ; RUN: opt --expandvp --expandvp-override-evl-transform=Convert --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=CONVERT_LEGAL -; Full expansion cases (all expanded to non-VP). +; Call expansion cases (all expanded to non-VP).
; RUN: opt --expandvp --expandvp-override-evl-transform=Discard --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT ; RUN: opt --expandvp -S < %s | FileCheck %s --check-prefix=ALL-CONVERT ; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT @@ -166,62 +166,70 @@ ; Check that reductions use the correct neutral element for masked-off elements ; ALL-CONVERT: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { -; ALL-CONVERT-NEXT: [[ADD:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; ALL-CONVERT-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; ALL-CONVERT-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; ALL-CONVERT-NEXT: [[ADD:%.+]] = select <4 x i1> [[NEWM]], <4 x i32> %vi, <4 x i32> zeroinitializer ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ADD]]) ; ALL-CONVERT-NEXT: %{{.+}} = add i32 [[RED]], %start -; ALL-CONVERT-NEXT: [[MUL:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT: [[MUL:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[MUL]]) ; ALL-CONVERT-NEXT: %{{.+}} = mul i32 [[RED]], %start -; ALL-CONVERT-NEXT: [[AND:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT: [[AND:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[AND]]) ; ALL-CONVERT-NEXT: %{{.+}} = and i32 [[RED]], %start -; ALL-CONVERT-NEXT: [[OR:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT: [[OR:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> zeroinitializer ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[OR]]) ; ALL-CONVERT-NEXT: %{{.+}} = or i32 [[RED]], %start -; ALL-CONVERT-NEXT: [[XOR:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT: [[XOR:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> zeroinitializer ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[XOR]]) ; ALL-CONVERT-NEXT: %{{.+}} = xor i32 [[RED]], %start -; ALL-CONVERT-NEXT: [[SMIN:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT: [[SMIN:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[SMIN]]) ; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.smin.i32(i32 [[RED]], i32 %start) -; ALL-CONVERT-NEXT: [[SMAX:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT: [[SMAX:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[SMAX]]) ; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.smax.i32(i32 [[RED]], i32 %start) -; ALL-CONVERT-NEXT: [[UMIN:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> +; ALL-CONVERT: [[UMIN:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[UMIN]]) ; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.umin.i32(i32 [[RED]], i32 %start) -; 
ALL-CONVERT-NEXT: [[UMAX:%.+]] = select <4 x i1> %m, <4 x i32> %vi, <4 x i32> zeroinitializer +; ALL-CONVERT: [[UMAX:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> zeroinitializer ; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[UMAX]]) ; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.umax.i32(i32 [[RED]], i32 %start) ; ALL-CONVERT-NEXT: ret void ; Check that reductions use the correct neutral element for masked-off elements ; ALL-CONVERT: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { -; ALL-CONVERT-NEXT: [[FMIN:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; ALL-CONVERT-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; ALL-CONVERT-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; ALL-CONVERT-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m +; ALL-CONVERT-NEXT: [[FMIN:%.+]] = select <4 x i1> [[NEWM]], <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN]]) ; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.minnum.f32(float [[RED]], float %f) -; ALL-CONVERT-NEXT: [[FMIN_NNAN:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FMIN_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN]]) ; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.minnum.f32(float [[RED]], float %f) -; ALL-CONVERT-NEXT: [[FMIN_NNAN_NINF:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FMIN_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN_NINF]]) ; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.minnum.f32(float [[RED]], float %f) -; ALL-CONVERT-NEXT: [[FMAX:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FMAX:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX]]) ; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.maxnum.f32(float [[RED]], float %f) -; ALL-CONVERT-NEXT: [[FMAX_NNAN:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FMAX_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX_NNAN]]) ; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.maxnum.f32(float [[RED]], float %f) -; ALL-CONVERT-NEXT: [[FMAX_NNAN_NINF:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FMAX_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX_NNAN_NINF]]) ; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.maxnum.f32(float [[RED]], float %f) -; ALL-CONVERT-NEXT: [[FADD:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FADD:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fadd.v4f32(float %f, <4 x float> [[FADD]]) -; ALL-CONVERT-NEXT: [[FADD:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FADD:%.+]] = 
select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: %{{.+}} = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %f, <4 x float> [[FADD]]) -; ALL-CONVERT-NEXT: [[FMUL:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FMUL:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fmul.v4f32(float %f, <4 x float> [[FMUL]]) -; ALL-CONVERT-NEXT: [[FMUL:%.+]] = select <4 x i1> %m, <4 x float> %vf, <4 x float> +; ALL-CONVERT: [[FMUL:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> ; ALL-CONVERT-NEXT: %{{.+}} = call reassoc float @llvm.vector.reduce.fmul.v4f32(float %f, <4 x float> [[FMUL]]) ; ALL-CONVERT-NEXT: ret void @@ -332,29 +340,37 @@ ; DISCARD_LEGAL: ret void ; DISCARD_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { -; DISCARD_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: ret void +; DISCARD_LEGAL-NEXT: [[NSPLATINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; DISCARD_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NSPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; DISCARD_LEGAL-NEXT: [[EVLMASK:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; DISCARD_LEGAL-NEXT: [[NEWMASK:%.+]] = and <4 x i1> [[EVLMASK]], %m +; DISCARD_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> [[NEWMASK]], i32 4) +; DISCARD_LEGAL-NOT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) +; DISCARD_LEGAL: ret void ; DISCARD_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { -; DISCARD_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; 
DISCARD_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NEXT: ret void +; DISCARD_LEGAL-NEXT: [[NSPLATINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i32 0 +; DISCARD_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NSPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; DISCARD_LEGAL-NEXT: [[EVLMASK:%.+]] = icmp ult <4 x i32> , [[NSPLAT]] +; DISCARD_LEGAL-NEXT: [[NEWMASK:%.+]] = and <4 x i1> [[EVLMASK]], %m +; DISCARD_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> [[NEWMASK]], i32 4) +; DISCARD_LEGAL-NOT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL-NOT: %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) +; DISCARD_LEGAL: ret void ; Convert %evl into %mask everywhere (%evl Convert, %mask Legal) ;