Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -3716,41 +3716,41 @@
 // shfl.down.b32 dest, val, offset, mask_and_clamp
 def int_nvvm_shfl_down_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.i32">,
   GCCBuiltin<"__nvvm_shfl_down_i32">;
 def int_nvvm_shfl_down_f32 :
   Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.f32">,
   GCCBuiltin<"__nvvm_shfl_down_f32">;
 
 // shfl.up.b32 dest, val, offset, mask_and_clamp
 def int_nvvm_shfl_up_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.i32">,
   GCCBuiltin<"__nvvm_shfl_up_i32">;
 def int_nvvm_shfl_up_f32 :
   Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.f32">,
   GCCBuiltin<"__nvvm_shfl_up_f32">;
 
 // shfl.bfly.b32 dest, val, offset, mask_and_clamp
 def int_nvvm_shfl_bfly_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">,
   GCCBuiltin<"__nvvm_shfl_bfly_i32">;
 def int_nvvm_shfl_bfly_f32 :
   Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">,
   GCCBuiltin<"__nvvm_shfl_bfly_f32">;
 
 // shfl.idx.b32 dest, val, lane, mask_and_clamp
 def int_nvvm_shfl_idx_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.i32">,
   GCCBuiltin<"__nvvm_shfl_idx_i32">;
 def int_nvvm_shfl_idx_f32 :
   Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
   GCCBuiltin<"__nvvm_shfl_idx_f32">;
 
 // Synchronizing shfl variants available in CUDA-9.
@@ -3760,41 +3760,41 @@
 // shfl.sync.down.b32 dest, threadmask, val, offset , mask_and_clamp
 def int_nvvm_shfl_sync_down_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
   GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
 def int_nvvm_shfl_sync_down_f32 :
   Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
   GCCBuiltin<"__nvvm_shfl_sync_down_f32">;
 
 // shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
 def int_nvvm_shfl_sync_up_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
   GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
 def int_nvvm_shfl_sync_up_f32 :
   Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
   GCCBuiltin<"__nvvm_shfl_sync_up_f32">;
 
 // shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
 def int_nvvm_shfl_sync_bfly_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
   GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
 def int_nvvm_shfl_sync_bfly_f32 :
   Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
   GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;
 
 // shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
 def int_nvvm_shfl_sync_idx_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
   GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
 def int_nvvm_shfl_sync_idx_f32 :
   Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
   GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
 
 //
@@ -3804,22 +3804,22 @@
 // vote.all pred
 def int_nvvm_vote_all :
   Intrinsic<[llvm_i1_ty], [llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.all">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all">,
   GCCBuiltin<"__nvvm_vote_all">;
 // vote.any pred
 def int_nvvm_vote_any :
   Intrinsic<[llvm_i1_ty], [llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.any">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any">,
   GCCBuiltin<"__nvvm_vote_any">;
 // vote.uni pred
 def int_nvvm_vote_uni :
   Intrinsic<[llvm_i1_ty], [llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.uni">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni">,
   GCCBuiltin<"__nvvm_vote_uni">;
 // vote.ballot pred
 def int_nvvm_vote_ballot :
   Intrinsic<[llvm_i32_ty], [llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.ballot">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot">,
   GCCBuiltin<"__nvvm_vote_ballot">;
 
 //
@@ -3829,22 +3829,22 @@
 // vote.sync.all mask, pred
 def int_nvvm_vote_all_sync :
   Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.all.sync">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all.sync">,
   GCCBuiltin<"__nvvm_vote_all_sync">;
 // vote.sync.any mask, pred
 def int_nvvm_vote_any_sync :
   Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.any.sync">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any.sync">,
   GCCBuiltin<"__nvvm_vote_any_sync">;
 // vote.sync.uni mask, pred
 def int_nvvm_vote_uni_sync :
   Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.uni.sync">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni.sync">,
   GCCBuiltin<"__nvvm_vote_uni_sync">;
 // vote.sync.ballot mask, pred
 def int_nvvm_vote_ballot_sync :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.ballot.sync">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot.sync">,
   GCCBuiltin<"__nvvm_vote_ballot_sync">;
 
 //
@@ -3853,12 +3853,12 @@
 // match.any.sync.b32 mask, value
 def int_nvvm_match_any_sync_i32 :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.match.any.sync.i32">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i32">,
   GCCBuiltin<"__nvvm_match_any_sync_i32">;
 // match.any.sync.b64 mask, value
 def int_nvvm_match_any_sync_i64 :
   Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.match.any.sync.i64">,
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i64">,
   GCCBuiltin<"__nvvm_match_any_sync_i64">;
 
 // match.all instruction have two variants -- one returns a single value, another
@@ -3868,11 +3868,11 @@
 // match.all.sync.b32p mask, value
 def int_nvvm_match_all_sync_i32p :
   Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">;
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">;
 // match.all.sync.b64p mask, value
 def int_nvvm_match_all_sync_i64p :
   Intrinsic<[llvm_i64_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
-            [IntrNoMem, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">;
+            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">;
 
 //
 // WMMA instructions
Index: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -811,6 +811,10 @@
   switch (IID) {
   default:
     return false;
+  case Intrinsic::nvvm_match_all_sync_i32p:
+  case Intrinsic::nvvm_match_all_sync_i64p:
+    SelectMatchAll(N);
+    return true;
   case Intrinsic::nvvm_ldg_global_f:
   case Intrinsic::nvvm_ldg_global_i:
   case Intrinsic::nvvm_ldg_global_p:
@@ -1025,10 +1029,6 @@
   case Intrinsic::nvvm_texsurf_handle_internal:
     SelectTexSurfHandle(N);
     return true;
-  case Intrinsic::nvvm_match_all_sync_i32p:
-  case Intrinsic::nvvm_match_all_sync_i64p:
-    SelectMatchAll(N);
-    return true;
   case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
   case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
   case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
@@ -1075,12 +1075,13 @@
 
 void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) {
   SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
   enum { IS_I64 = 4, HAS_CONST_VALUE = 2, HAS_CONST_MASK = 1 };
-  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   unsigned OpcodeIndex =
       (IID == Intrinsic::nvvm_match_all_sync_i64p) ? IS_I64 : 0;
-  SDValue MaskOp = N->getOperand(1);
-  SDValue ValueOp = N->getOperand(2);
+  SDValue MaskOp = N->getOperand(2);
+  SDValue ValueOp = N->getOperand(3);
   if (ConstantSDNode *ValueConst = dyn_cast<ConstantSDNode>(ValueOp)) {
     OpcodeIndex |= HAS_CONST_VALUE;
     ValueOp = CurDAG->getTargetConstant(ValueConst->getZExtValue(), DL,
@@ -1097,9 +1098,10 @@
       NVPTX::MATCH_ALLP_SYNC_32ir, NVPTX::MATCH_ALLP_SYNC_32ii,
       NVPTX::MATCH_ALLP_SYNC_64rr, NVPTX::MATCH_ALLP_SYNC_64ri,
       NVPTX::MATCH_ALLP_SYNC_64ir, NVPTX::MATCH_ALLP_SYNC_64ii};
-  SDNode *NewNode = CurDAG->getMachineNode(Opcodes[OpcodeIndex], DL,
-                                           {ValueOp->getValueType(0), MVT::i1},
-                                           {MaskOp, ValueOp});
+  // Feed the input chain through so the chained node stays ordered in the DAG.
+  SDNode *NewNode = CurDAG->getMachineNode(
+      Opcodes[OpcodeIndex], DL, {ValueOp->getValueType(0), MVT::i1, MVT::Other},
+      {MaskOp, ValueOp, Chain});
   ReplaceNode(N, NewNode);
 }
 
Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3321,6 +3321,16 @@
   switch (Intrinsic) {
   default:
     return false;
+  case Intrinsic::nvvm_match_all_sync_i32p:
+  case Intrinsic::nvvm_match_all_sync_i64p:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
+    // in order to model data exchange with other threads, but perform no real
+    // memory accesses.
+    Info.memVT = MVT::i1;
+    Info.readMem = true;  // Our result depends on other threads' arguments.
+    Info.writeMem = true; // Other threads depend on our thread's argument.
+    return true;
   case Intrinsic::nvvm_wmma_load_a_f16_col:
   case Intrinsic::nvvm_wmma_load_a_f16_row:
   case Intrinsic::nvvm_wmma_load_a_f16_col_stride: