Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -186,6 +186,12 @@
   "Has VGPR mode register indexing"
 >;
 
+def FeatureScalarStores : SubtargetFeature<"scalar-stores",
+  "HasScalarStores",
+  "true",
+  "Has scalar memory store instructions"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -327,7 +333,7 @@
    FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
    FeatureGCN3Encoding, FeatureCIInsts, FeatureMovrel,
    Feature16BitInsts, FeatureInv2PiInlineImm, FeatureSMemRealTime,
-   FeatureVGPRIndexMode
+   FeatureVGPRIndexMode, FeatureScalarStores
   ]
 >;
 
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -102,6 +102,7 @@
   bool HasMovrel;
   bool HasVGPRIndexMode;
   bool HasInv2PiInlineImm;
+  bool HasScalarStores;
   bool FlatAddressSpace;
   bool R600ALUInst;
   bool CaymanISA;
@@ -520,6 +521,10 @@
     return HasInv2PiInlineImm;
   }
 
+  bool hasScalarStores() const {
+    return HasScalarStores;
+  }
+
   bool enableSIScheduler() const {
     return EnableSIScheduler;
   }
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -44,7 +44,8 @@
   VOPAsmPrefer32Bit = 1 << 25,
   Gather4 = 1 << 26,
   DisableWQM = 1 << 27,
-  SOPK_ZEXT = 1 << 28
+  SOPK_ZEXT = 1 << 28,
+  SCALAR_STORE = 1 << 29
 };
 }
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -60,6 +60,11 @@
   // use it as unsigned.
   field bits<1> SOPKZext = 0;
 
+  // This is an s_store_dword* instruction that requires a cache flush
+  // on wave termination. It is needed to distinguish these from other
+  // mayStore SMEM instructions, such as the cache flushes themselves.
+  field bits<1> ScalarStore = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = VM_CNT;
   let TSFlags{1} = EXP_CNT;
@@ -94,6 +99,7 @@
   let TSFlags{26} = Gather4;
   let TSFlags{27} = DisableWQM;
   let TSFlags{28} = SOPKZext;
+  let TSFlags{29} = ScalarStore;
 
   let SchedRW = [Write32Bit];
 
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -417,6 +417,16 @@
     return get(Opcode).TSFlags & SIInstrFlags::SOPK_ZEXT;
   }
 
+  /// \returns true if this is an s_store_dword* instruction. This is more
+  /// specific than isSMEM && mayStore.
+  static bool isScalarStore(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE;
+  }
+
+  bool isScalarStore(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2080,6 +2080,18 @@
     }
   }
 
+  if (isSMRD(MI)) {
+    if (MI.mayStore()) {
+      // The register offset form of scalar stores may only use m0 as the
+      // soffset register.
+      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
+      if (Soff && Soff->getReg() != AMDGPU::M0) {
+        ErrInfo = "scalar stores must use m0 as offset register";
+        return false;
+      }
+    }
+  }
+
   return true;
 }
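The new ScalarStore TSFlags bit exists so a later pass can cheaply ask whether a wave has issued any s_store_dword*-style stores and therefore needs a scalar data-cache writeback before it terminates. A minimal sketch of such a query, assuming the usual LLVM CodeGen headers; the helper name is illustrative and not part of this patch:

    #include "SIInstrInfo.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineFunction.h"

    // Hypothetical helper: true if any instruction in the function is a
    // scalar store, i.e. the wave should flush the scalar cache (for
    // example with s_dcache_wb) before s_endpgm.
    static bool waveNeedsScalarCacheFlush(const llvm::MachineFunction &MF) {
      for (const llvm::MachineBasicBlock &MBB : MF)
        for (const llvm::MachineInstr &MI : MBB)
          if (llvm::SIInstrInfo::isScalarStore(MI))
            return true;
      return false;
    }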
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -37,6 +37,7 @@
 
   bits<1> has_sbase = 1;
   bits<1> has_sdst = 1;
+  bit has_glc = 0;
   bits<1> has_offset = 1;
   bits<1> offset_is_imm = 0;
 }
@@ -55,12 +56,25 @@
   bits<7>  sbase;
   bits<7>  sdst;
   bits<32> offset;
-  bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
+  bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
 }
 
 class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps,
                       list<dag> pattern=[]>
   : SM_Pseudo<opName, outs, ins, asmOps, pattern> {
   RegisterClass BaseClass;
+  let mayLoad = 1;
+  let mayStore = 0;
+  let has_glc = 1;
+}
+
+class SM_Store_Pseudo <string opName, dag ins, string asmOps,
+                       list<dag> pattern = []>
+  : SM_Pseudo<opName, (outs), ins, asmOps, pattern> {
+  RegisterClass BaseClass;
+  RegisterClass SrcClass;
+  let mayLoad = 0;
+  let mayStore = 1;
+  let has_glc = 1;
+  let ScalarStore = 1;
 }
 
 multiclass SM_Pseudo_Loads <string opName,
                             RegisterClass baseClass,
                             RegisterClass dstClass> {
   def _IMM : SM_Load_Pseudo <opName, (outs dstClass:$sdst),
-                             (ins baseClass:$sbase, i32imm:$offset),
-                             " $sdst, $sbase, $offset", []> {
+                             (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+                             " $sdst, $sbase, $offset$glc", []> {
     let offset_is_imm = 1;
     let BaseClass = baseClass;
     let PseudoInstr = opName # "_IMM";
+    let has_glc = 1;
   }
-  def _SGPR : SM_Load_Pseudo <opName, (outs dstClass:$sdst),
-              (ins baseClass:$sbase, SReg_32:$soff),
-              " $sdst, $sbase, $offset", []> {
+
+  def _SGPR : SM_Load_Pseudo <opName, (outs dstClass:$sdst),
+              (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+              " $sdst, $sbase, $offset$glc", []> {
+    let BaseClass = baseClass;
+    let PseudoInstr = opName # "_SGPR";
+    let has_glc = 1;
+  }
+}
+
+multiclass SM_Pseudo_Stores <string opName,
+                             RegisterClass baseClass,
+                             RegisterClass srcClass> {
+  def _IMM : SM_Store_Pseudo <opName,
+    (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+    " $sdata, $sbase, $offset$glc", []> {
+    let offset_is_imm = 1;
+    let BaseClass = baseClass;
+    let SrcClass = srcClass;
+    let PseudoInstr = opName # "_IMM";
+  }
+
+  def _SGPR : SM_Store_Pseudo <opName,
+    (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+    " $sdata, $sbase, $offset$glc", []> {
     let BaseClass = baseClass;
+    let SrcClass = srcClass;
     let PseudoInstr = opName # "_SGPR";
   }
 }
@@ -139,6 +177,23 @@
   "s_buffer_load_dwordx16", SReg_128, SReg_512
 >;
 
+defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0>;
+defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64>;
+defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
+
+defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <
+  "s_buffer_store_dword", SReg_128, SReg_32_XM0
+>;
+
+defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
+  "s_buffer_store_dwordx2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
+  "s_buffer_store_dwordx4", SReg_128, SReg_128
+>;
+
+
 def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
 def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
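The _SGPR forms of the stores defined above accept only m0 as the soffset operand, which is what the verifier check added in SIInstrInfo.cpp enforces. A sketch of what building one by hand has to look like under that constraint; the helper name and register choices are illustrative, not part of this patch:

    #include "SIInstrInfo.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"

    using namespace llvm;

    // Hypothetical helper: emit an s_store_dword with a register offset.
    // The offset must be copied into m0 first; any other soffset register
    // now fails verification with "scalar stores must use m0 as offset
    // register".
    static void emitScalarStoreSGPR(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const SIInstrInfo &TII, const DebugLoc &DL,
                                    unsigned Data, unsigned Base,
                                    unsigned Off) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
          .addReg(Off);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_STORE_DWORD_SGPR))
          .addReg(Data)        // $sdata
          .addReg(Base)        // $sbase
          .addReg(AMDGPU::M0)  // $soff, must be m0
          .addImm(0);          // $glc
    }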
@@ -179,13 +234,13 @@
   // 1. IMM offset
   def : Pat <
     (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset))
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
   >;
 
   // 2. SGPR offset
   def : Pat <
     (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset))
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
   >;
 }
@@ -210,13 +265,13 @@
 // 1. Offset as an immediate
 def SM_LOAD_PATTERN : Pat <  // name this pattern to reuse AddedComplexity on CI
   (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset)
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
 >;
 
 // 2. Offset loaded in an 32bit SGPR
 def : Pat <
   (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset)
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
 >;
 
 } // End let AddedComplexity = 100
@@ -228,7 +283,7 @@
 // 1. Offset as 20bit DWORD immediate
 def : Pat <
   (SIload_constant v4i32:$sbase, IMM20bit:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0)
 >;
 
 def : Pat <
@@ -263,15 +318,22 @@
   let Inst{31-27} = 0x18; //encoding
 }
 
+// FIXME: Assembler should reject trying to use glc on SMRD
+// instructions on SI.
 multiclass SM_Real_Loads_si <bits<5> op, string ps,
                              SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#"_IMM"),
                              SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#"_SGPR")> {
+
   def _IMM_si : SMRD_Real_si <op, immPs> {
-    let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset:$offset);
+    let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset:$offset, GLC:$glc);
   }
+
+  // FIXME: The operand name $offset is inconsistent with $soff used
+  // in the pseudo.
   def _SGPR_si : SMRD_Real_si <op, sgprPs> {
-    let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset);
+    let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
   }
+
 }
 
 defm S_LOAD_DWORD : SM_Real_Loads_si <0x00, "S_LOAD_DWORD">;
@@ -297,6 +359,7 @@
   : SM_Real <ps>
   , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI>
   , Enc64 {
+  bit glc;
 
   let AssemblerPredicates = [isVI];
   let DecoderNamespace = "VI";
@@ -304,10 +367,8 @@
   let Inst{5-0}  = !if(ps.has_sbase, sbase{6-1}, ?);
   let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
 
-  // glc is only applicable to scalar stores, which are not yet
-  // implemented.
-  let Inst{16} = 0; // glc bit
-  let Inst{17} = imm;
+  let Inst{16} = !if(ps.has_glc, glc, ?);
+  let Inst{17} = imm;
   let Inst{25-18} = op;
   let Inst{31-26} = 0x30; //encoding
   let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?);
@@ -317,10 +378,19 @@
 multiclass SM_Real_Loads_vi <bits<8> op, string ps,
                              SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#"_IMM"),
                              SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#"_SGPR")> {
   def _IMM_vi : SMEM_Real_vi <op, immPs> {
-    let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset:$offset);
+    let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset:$offset, GLC:$glc);
+  }
+  def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
+    let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
   }
+}
+
+multiclass SM_Real_Stores_vi <bits<8> op, string ps,
+                              SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#"_SGPR")> {
+  // FIXME: The operand name $offset is inconsistent with $soff used
+  // in the pseudo.
   def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
-    let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset);
+    let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
   }
 }
@@ -335,6 +405,14 @@
 defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b, "S_BUFFER_LOAD_DWORDX8">;
 defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c, "S_BUFFER_LOAD_DWORDX16">;
 
+defm S_STORE_DWORD : SM_Real_Stores_vi <0x10, "S_STORE_DWORD">;
+defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11, "S_STORE_DWORDX2">;
+defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12, "S_STORE_DWORDX4">;
+
+defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18, "S_BUFFER_STORE_DWORD">;
+defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19, "S_BUFFER_STORE_DWORDX2">;
+defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a, "S_BUFFER_STORE_DWORDX4">;
+
 def S_DCACHE_INV_vi     : SMEM_Real_vi <0x20, S_DCACHE_INV>;
 def S_DCACHE_WB_vi      : SMEM_Real_vi <0x21, S_DCACHE_WB>;
 def S_DCACHE_INV_VOL_vi : SMEM_Real_vi <0x22, S_DCACHE_INV_VOL>;
@@ -358,7 +436,7 @@
   let AssemblerPredicates = [isCIOnly];
   let DecoderNamespace = "CI";
 
-  let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset);
+  let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc);
 
   let LGKM_CNT = ps.LGKM_CNT;
   let SMRD = ps.SMRD;
@@ -410,7 +488,7 @@
 
 class SMRD_Pattern_ci <string Instr, ValueType vt> : Pat <
   (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
-  (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset))> {
+  (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
   let Predicates = [isCIOnly];
 }
 
@@ -422,7 +500,7 @@
 
 def : Pat <
   (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset)> {
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
   let Predicates = [isCI]; // should this be isCIOnly?
 }
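Since every SMRD pseudo now carries a trailing glc immediate, anything that builds these loads programmatically has to supply it, and hand-written MIR has to spell it out, which is why each S_LOAD_* in the tests below gains a trailing ", 0". A sketch of the corresponding change for code that constructs such a load; the helper is illustrative and not from this patch:

    #include "SIInstrInfo.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"

    using namespace llvm;

    // Hypothetical helper: the final .addImm(0) is the new glc operand;
    // omitting it now leaves the instruction with too few operands.
    static MachineInstr *emitSLoadDwordImm(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator I,
                                           const SIInstrInfo &TII,
                                           const DebugLoc &DL, unsigned Dst,
                                           unsigned Base, int64_t Offset) {
      return BuildMI(MBB, I, DL, TII.get(AMDGPU::S_LOAD_DWORD_IMM), Dst)
          .addReg(Base)
          .addImm(Offset)
          .addImm(0); // glc
    }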
Index: test/CodeGen/AMDGPU/coalescer-subreg-join.mir
===================================================================
--- test/CodeGen/AMDGPU/coalescer-subreg-join.mir
+++ test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -46,10 +46,10 @@
     %0 = COPY %sgpr2_sgpr3
     %1 = COPY %vgpr2
     %2 = COPY %vgpr3
-    %3 = S_LOAD_DWORDX8_IMM %0, 0
-    %4 = S_LOAD_DWORDX4_IMM %0, 12
-    %5 = S_LOAD_DWORDX8_IMM %0, 16
-    %6 = S_LOAD_DWORDX4_IMM %0, 28
+    %3 = S_LOAD_DWORDX8_IMM %0, 0, 0
+    %4 = S_LOAD_DWORDX4_IMM %0, 12, 0
+    %5 = S_LOAD_DWORDX8_IMM %0, 16, 0
+    %6 = S_LOAD_DWORDX4_IMM %0, 28, 0
     undef %7.sub0 = S_MOV_B32 212739
     %20 = COPY %7
     %11 = COPY %20
Index: test/CodeGen/MIR/AMDGPU/target-index-operands.mir
===================================================================
--- test/CodeGen/MIR/AMDGPU/target-index-operands.mir
+++ test/CodeGen/MIR/AMDGPU/target-index-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=SI -run-pass none -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass none -o - %s | FileCheck %s
 # This test verifies that the MIR parser can parse target index operands.
 
 --- |
@@ -55,15 +55,15 @@
     %sgpr2 = S_ADD_U32 %sgpr2, target-index(amdgpu-constdata-start), implicit-def %scc, implicit-def %scc
     %sgpr3 = S_ADDC_U32 %sgpr3, 0, implicit-def %scc, implicit %scc, implicit-def %scc, implicit %scc
     %sgpr4_sgpr5 = S_LSHR_B64 %sgpr2_sgpr3, 32, implicit-def dead %scc
-    %sgpr6 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 11
+    %sgpr6 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 11, 0
     %sgpr7 = S_ASHR_I32 %sgpr6, 31, implicit-def dead %scc
     %sgpr6_sgpr7 = S_LSHL_B64 %sgpr6_sgpr7, 2, implicit-def dead %scc
     %sgpr2 = S_ADD_U32 %sgpr2, @float_gv, implicit-def %scc
     %sgpr3 = S_ADDC_U32 %sgpr4, 0, implicit-def dead %scc, implicit %scc
     %sgpr4 = S_ADD_U32 %sgpr2, %sgpr6, implicit-def %scc
     %sgpr5 = S_ADDC_U32 %sgpr3, %sgpr7, implicit-def dead %scc, implicit %scc
-    %sgpr2 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 0
-    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 0, 0
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9, 0
     %sgpr7 = S_MOV_B32 61440
     %sgpr6 = S_MOV_B32 -1
     %vgpr0 = V_MOV_B32_e32 killed %sgpr2, implicit %exec
@@ -85,15 +85,15 @@
     %sgpr2 = S_ADD_U32 %sgpr2, target-index(amdgpu-constdata-start) + 1, implicit-def %scc, implicit-def %scc
    %sgpr3 = S_ADDC_U32 %sgpr3, 0, implicit-def %scc, implicit %scc, implicit-def %scc, implicit %scc
     %sgpr4_sgpr5 = S_LSHR_B64 %sgpr2_sgpr3, 32, implicit-def dead %scc
-    %sgpr6 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 11
+    %sgpr6 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 11, 0
     %sgpr7 = S_ASHR_I32 %sgpr6, 31, implicit-def dead %scc
     %sgpr6_sgpr7 = S_LSHL_B64 %sgpr6_sgpr7, 2, implicit-def dead %scc
     %sgpr2 = S_ADD_U32 %sgpr2, @float_gv, implicit-def %scc
     %sgpr3 = S_ADDC_U32 %sgpr4, 0, implicit-def dead %scc, implicit %scc
     %sgpr4 = S_ADD_U32 %sgpr2, %sgpr6, implicit-def %scc
     %sgpr5 = S_ADDC_U32 %sgpr3, %sgpr7, implicit-def dead %scc, implicit %scc
-    %sgpr2 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 0
-    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 0, 0
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9, 0
     %sgpr7 = S_MOV_B32 61440
     %sgpr6 = S_MOV_B32 -1
     %vgpr0 = V_MOV_B32_e32 killed %sgpr2, implicit %exec
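To see where glc lands in the VI encoding, the SMEM_Real_vi layout above (Inst{16} = glc, Inst{17} = imm, Inst{25-18} = op, Inst{31-26} = 0x30) can be replayed by hand. This standalone sketch reproduces the byte patterns checked in the smem.s test below, where s_store_dword s1, s[2:3], 0xfc encodes as 0x42,0xc0 without glc and 0x43,0xc0 with it:

    #include <cassert>
    #include <cstdint>

    // Assemble the low word of a VI SMEM instruction per SMEM_Real_vi.
    static uint32_t smemLowWord(unsigned op, bool imm, bool glc,
                                unsigned sbase, unsigned sdata) {
      uint32_t W = 0;
      W |= (sbase >> 1) & 0x3f;   // Inst{5-0}   = sbase{6-1}
      W |= (sdata & 0x7f) << 6;   // Inst{12-6}  = sdata/sdst
      W |= uint32_t(glc) << 16;   // Inst{16}    = glc
      W |= uint32_t(imm) << 17;   // Inst{17}    = imm
      W |= (op & 0xff) << 18;     // Inst{25-18} = op
      W |= 0x30u << 26;           // Inst{31-26} = SMEM encoding
      return W;
    }

    int main() {
      // s_store_dword s1, s[2:3], 0xfc     -> bytes 0x41,0x00,0x42,0xc0
      assert(smemLowWord(0x10, true, false, 2, 1) == 0xc0420041);
      // s_store_dword s1, s[2:3], 0xfc glc -> bytes 0x41,0x00,0x43,0xc0
      assert(smemLowWord(0x10, true, true, 2, 1) == 0xc0430041);
    }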
Index: test/MC/AMDGPU/smem.s
===================================================================
--- test/MC/AMDGPU/smem.s
+++ test/MC/AMDGPU/smem.s
@@ -13,3 +13,27 @@
 s_memrealtime s[4:5]
 // VI: s_memrealtime s[4:5] ; encoding: [0x00,0x01,0x94,0xc0,0x00,0x00,0x00,0x00]
 // NOSI: error: instruction not supported on this GPU
+
+// FIXME: On SI, the glc forms below should also report "instruction not
+// supported on this GPU" rather than "invalid operand for instruction".
+s_store_dword s1, s[2:3], 0xfc
+// VI: s_store_dword s1, s[2:3], 0xfc ; encoding: [0x41,0x00,0x42,0xc0,0xfc,0x00,0x00,0x00]
+// NOSI: error: instruction not supported on this GPU
+
+s_store_dword s1, s[2:3], 0xfc glc
+// VI: s_store_dword s1, s[2:3], 0xfc glc ; encoding: [0x41,0x00,0x43,0xc0,0xfc,0x00,0x00,0x00]
+// NOSI: error: invalid operand for instruction
+
+s_store_dword s1, s[2:3], s4
+// VI: s_store_dword s1, s[2:3], s4 ; encoding: [0x41,0x00,0x40,0xc0,0x04,0x00,0x00,0x00]
+// NOSI: error: instruction not supported on this GPU
+
+s_store_dword s1, s[2:3], s4 glc
+// VI: s_store_dword s1, s[2:3], s4 glc ; encoding: [0x41,0x00,0x41,0xc0,0x04,0x00,0x00,0x00]
+// NOSI: error: invalid operand for instruction
+
+// FIXME: Should error on SI instead of silently ignoring glc
+s_load_dword s1, s[2:3], 0xfc glc
+// VI: s_load_dword s1, s[2:3], 0xfc glc ; encoding: [0x41,0x00,0x03,0xc0,0xfc,0x00,0x00,0x00]
+
+s_load_dword s1, s[2:3], s4 glc
+// VI: s_load_dword s1, s[2:3], s4 glc ; encoding: [0x41,0x00,0x01,0xc0,0x04,0x00,0x00,0x00]
Index: test/MC/AMDGPU/smrd-err.s
===================================================================
--- test/MC/AMDGPU/smrd-err.s
+++ test/MC/AMDGPU/smrd-err.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=SI %s
 // RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 s_load_dwordx4 s[100:103], s[2:3], s4