Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -280,7 +280,9 @@ MUL_LOHI_I24, MUL_LOHI_U24, TEXTURE_FETCH, - EXPORT, + EXPORT, // exp on SI+ + EXPORT_DONE, // exp on SI+ with done bit set + R600_EXPORT, CONST_ADDRESS, REGISTER_LOAD, REGISTER_STORE, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2998,6 +2998,8 @@ NODE_NAME_CASE(MAD_I24) NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(EXPORT_DONE) + NODE_NAME_CASE(R600_EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -265,9 +265,35 @@ SDTypeProfile<1, 4, [SDTCisFP<0>]>, [SDNPInGlue]>; + def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, [SDNPHasChain, SDNPSideEffect]>; +// SI+ export +def AMDGPUExportOp : SDTypeProfile<0, 8, [ + SDTCisInt<0>, // i8 en + SDTCisInt<1>, // i1 vm + // skip done + SDTCisInt<2>, // i8 tgt + SDTCisSameAs<3, 1>, // i1 compr + SDTCisFP<4>, // f32 src0 + SDTCisSameAs<5, 4>, // f32 src1 + SDTCisSameAs<6, 4>, // f32 src2 + SDTCisSameAs<7, 4> // f32 src3 +]>; + +def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp, + [SDNPHasChain, SDNPMayStore]>; + +def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; + + +def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; + +def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp, + [SDNPHasChain, SDNPSideEffect]>; + //===----------------------------------------------------------------------===// // Flow Control Profile Types //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/R600ISelLowering.cpp +++ lib/Target/AMDGPU/R600ISelLowering.cpp @@ -443,7 +443,7 @@ DAG.getConstant(2, DL, MVT::i32), // SWZ_Z DAG.getConstant(3, DL, MVT::i32) // SWZ_W }; - return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); + return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args); } // default for switch(IntrinsicID) @@ -1882,7 +1882,7 @@ return SDValue(); } - case AMDGPUISD::EXPORT: { + case AMDGPUISD::R600_EXPORT: { SDValue Arg = N->getOperand(1); if (Arg.getOpcode() != ISD::BUILD_VECTOR) break; @@ -1898,7 +1898,7 @@ N->getOperand(7) // SWZ_W }; NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); + return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs); } case AMDGPUISD::TEXTURE_FETCH: { SDValue Arg = N->getOperand(1); Index: lib/Target/AMDGPU/R600Instructions.td =================================================================== --- lib/Target/AMDGPU/R600Instructions.td +++ lib/Target/AMDGPU/R600Instructions.td @@ -436,11 +436,6 @@ // Export Instructions //===----------------------------------------------------------------------===// -def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; - -def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, - [SDNPHasChain, SDNPSideEffect]>; - class ExportWord0 { field bits<32> Word0; @@ -486,7 +481,7 @@ } multiclass ExportPattern cf_inst> { - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), (ExportInst R600_Reg128:$src, imm:$type, imm:$base, imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2672,6 +2672,29 @@ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); } + case AMDGPUIntrinsic::SI_export: { + const ConstantSDNode *En = cast(Op.getOperand(2)); + const ConstantSDNode *VM = cast(Op.getOperand(3)); + const ConstantSDNode *Done = cast(Op.getOperand(4)); + const ConstantSDNode *Tgt = cast(Op.getOperand(5)); + const ConstantSDNode *Compr = cast(Op.getOperand(6)); + + const SDValue Ops[] = { + Chain, + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1), + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), + DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1), + Op.getOperand(7), // src0 + Op.getOperand(8), // src1 + Op.getOperand(9), // src2 + Op.getOperand(10) // src3 + }; + + unsigned Opc = Done->isNullValue() ? + AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + return DAG.getNode(Opc, DL, Op->getVTList(), Ops); + } default: return SDValue(); } Index: lib/Target/AMDGPU/SIInsertSkips.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertSkips.cpp +++ lib/Target/AMDGPU/SIInsertSkips.cpp @@ -159,16 +159,15 @@ MachineBasicBlock::iterator Insert = SkipBB->begin(); // Exec mask is zero: Export to NULL target... - BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE)) .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef); + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addImm(1) // vm + .addImm(0) // compr + .addImm(0); // en // ... and terminate wavefront. BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); Index: lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaits.cpp +++ lib/Target/AMDGPU/SIInsertWaits.cpp @@ -195,8 +195,7 @@ Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); // Only consider stores or EXP for EXP_CNT - Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && - (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore(); // LGKM may uses larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { @@ -340,7 +339,8 @@ // Remember which export instructions we have seen if (Increment.Named.EXP) { - ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; + ExpInstrTypesSeen |= + (I->getOpcode() == AMDGPU::EXP || I->getOpcode() == AMDGPU::EXP_DONE) ? 1 : 2; } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Index: lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- lib/Target/AMDGPU/SIInstrFormats.td +++ lib/Target/AMDGPU/SIInstrFormats.td @@ -232,6 +232,16 @@ let hasSideEffects = 0; } +class EXPCommon pattern> : + InstSI { + let EXP_CNT = 1; + let mayLoad = 0; // Set to 1 if done bit is set. + let mayStore = 1; + let UseNamedOperandTable = 1; + let Uses = [EXEC]; + let SchedRW = [WriteExport]; +} + } // End Uses = [EXEC] class MIMG pattern> : Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -518,32 +518,39 @@ // EXP classes //===----------------------------------------------------------------------===// -class EXPCommon : InstSI< +class EXP_Helper : EXPCommon< (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), - "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { - - let EXP_CNT = 1; - let Uses = [EXEC]; - let SchedRW = [WriteExport]; -} - -multiclass EXP_m { - - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ; - } - - def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe { - let DecoderNamespace="SICI"; - let DisableDecoder = DisableSIDecoder; - } + (ins i8imm:$tgt, VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3, + i1imm:$vm, i1imm:$compr, i8imm:$en), + "exp $en, $tgt, $compr, "#!if(done, "1", "0")#", $vm, $src0, $src1, $src2, $src3", + [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr), + f32:$src0, f32:$src1, f32:$src2, f32:$src3)] +>; - def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi { - let DecoderNamespace="VI"; - let DisableDecoder = DisableVIDecoder; +// Split EXP instruction into EXP and EXP_DONE so we can set +// mayLoad for done=1. +multiclass EXP_m { + let mayLoad = done in { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : EXP_Helper, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>; + } + + let done = done in { + def _si : EXP_Helper, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>, + EXPe { + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; + } + + def _vi : EXP_Helper, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>, + EXPe_vi { + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; + } + } } } Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -37,7 +37,8 @@ // EXP Instructions //===----------------------------------------------------------------------===// -defm EXP : EXP_m; +defm EXP : EXP_m<0, AMDGPUexport>; +defm EXP_DONE : EXP_m<1, AMDGPUexport_done>; //===----------------------------------------------------------------------===// // VINTRP Instructions @@ -388,13 +389,6 @@ (SI_KILL (i32 0xbf800000)) >; -def : Pat < - (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, - f32:$src0, f32:$src1, f32:$src2, f32:$src3), - (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, - $src0, $src1, $src2, $src3) ->; - //===----------------------------------------------------------------------===// // VOP1 Patterns //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/SIIntrinsics.td =================================================================== --- lib/Target/AMDGPU/SIIntrinsics.td +++ lib/Target/AMDGPU/SIIntrinsics.td @@ -15,7 +15,20 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + + def int_SI_export : Intrinsic <[], + [llvm_i32_ty, // en + llvm_i32_ty, // vm (FIXME: should be i1) + llvm_i32_ty, // done (FIXME: should be i1) + llvm_i32_ty, // tgt + llvm_i32_ty, // compr (FIXME: should be i1) + llvm_float_ty, // src0 + llvm_float_ty, // src1 + llvm_float_ty, // src2 + llvm_float_ty], // src3 + [] + >; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; Index: test/CodeGen/AMDGPU/ret.ll =================================================================== --- test/CodeGen/AMDGPU/ret.ll +++ test/CodeGen/AMDGPU/ret.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}vgpr: ; GCN: v_mov_b32_e32 v1, v0 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 -; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1 +; GCN-DAG: exp 15, 0, -1, 1, -1, v1, v1, v1, v1 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { @@ -19,7 +19,7 @@ ; GCN-LABEL: {{^}}vgpr_literal: ; GCN: v_mov_b32_e32 v4, v0 -; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4 +; GCN: exp 15, 0, -1, 1, -1, v4, v4, v4, v4 ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 @@ -209,7 +209,7 @@ ; GCN-LABEL: {{^}}both: ; GCN: v_mov_b32_e32 v1, v0 -; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1 +; GCN-DAG: exp 15, 0, -1, 1, -1, v1, v1, v1, v1 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 ; GCN-DAG: s_add_i32 s0, s3, 2 ; GCN-DAG: s_mov_b32 s1, s2 @@ -231,7 +231,7 @@ ; GCN-LABEL: {{^}}structure_literal: ; GCN: v_mov_b32_e32 v3, v0 -; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3 +; GCN: exp 15, 0, -1, 1, -1, v3, v3, v3, v3 ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: s_mov_b32 s0, 2 ; GCN-DAG: s_mov_b32 s1, 3 Index: test/CodeGen/MIR/AMDGPU/movrels-bug.mir =================================================================== --- test/CodeGen/MIR/AMDGPU/movrels-bug.mir +++ test/CodeGen/MIR/AMDGPU/movrels-bug.mir @@ -25,7 +25,7 @@ %m0 = S_MOV_B32 undef %sgpr0 %vgpr1 = V_MOVRELS_B32_e32 undef %vgpr1, implicit %m0, implicit %exec, implicit killed %vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 %vgpr4 = V_MAC_F32_e32 undef %vgpr0, undef %vgpr0, undef %vgpr4, implicit %exec - EXP 15, 12, 0, 1, 0, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, implicit %exec + EXP_DONE 15, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, 0, 0, 12, implicit %exec S_ENDPGM ...