Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -815,6 +815,7 @@ def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; +// Obsolescent tbuffer intrinsics. def int_amdgcn_tbuffer_load : Intrinsic < [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 [llvm_v4i32_ty, // rsrc(SGPR) @@ -844,6 +845,54 @@ [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; +// New tbuffer intrinsics, with: +// - raw and struct variants +// - joint format field +// - joint cachepolicy field +def int_amdgcn_raw_tbuffer_load : Intrinsic < + [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + [IntrReadMem], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<0>; + +def int_amdgcn_raw_tbuffer_store : Intrinsic < + [], + [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + [IntrWriteMem], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1>; + +def int_amdgcn_struct_tbuffer_load : Intrinsic < + [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + [IntrReadMem], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<0>; + +def int_amdgcn_struct_tbuffer_store : Intrinsic < + [], + [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + [IntrWriteMem], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1>; + class AMDGPUBufferAtomic : Intrinsic < [llvm_i32_ty], [llvm_i32_ty, // vdata(VGPR) Index: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -161,8 +161,7 @@ ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, - ImmTyDFMT, - ImmTyNFMT, + ImmTyFORMAT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -312,8 +311,7 @@ bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } - bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } - bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } + bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -666,8 +664,7 @@ case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; - case ImmTyDFMT: OS << "DFMT"; break; - case ImmTyNFMT: OS << "NFMT"; break; + case ImmTyFORMAT: OS << "FORMAT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -1061,6 +1058,7 @@ OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands); OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); + OperandMatchResultTy parseDfmtNfmt(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); } @@ -3522,6 +3520,53 @@ return MatchOperand_Success; } +// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their +// values to live in a joint format operand in the MCInst encoding. +OperandMatchResultTy +AMDGPUAsmParser::parseDfmtNfmt(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Dfmt = 0, Nfmt = 0; + // dfmt and nfmt can appear in either order, and each is optional. + bool GotDfmt = false, GotNfmt = false; + while (!GotDfmt || !GotNfmt) { + if (!GotDfmt) { + auto Res = parseIntWithPrefix("dfmt", Dfmt); + if (Res != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Res; + if (Dfmt >= 16) { + Error(Parser.getTok().getLoc(), "out of range dfmt"); + return MatchOperand_ParseFail; + } + GotDfmt = true; + Parser.Lex(); + continue; + } + } + if (!GotNfmt) { + auto Res = parseIntWithPrefix("nfmt", Nfmt); + if (Res != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Res; + if (Nfmt >= 8) { + Error(Parser.getTok().getLoc(), "out of range nfmt"); + return MatchOperand_ParseFail; + } + GotNfmt = true; + Parser.Lex(); + continue; + } + } + break; + } + if (!GotDfmt && !GotNfmt) + return MatchOperand_NoMatch; + auto Format = Dfmt | Nfmt << 4; + Operands.push_back( + AMDGPUOperand::CreateImm(this, Format, S, AMDGPUOperand::ImmTyFORMAT)); + return MatchOperand_Success; +} + //===----------------------------------------------------------------------===// // ds //===----------------------------------------------------------------------===// @@ -4617,8 +4662,7 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); @@ -4761,8 +4805,7 @@ {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, - {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, - {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -4844,6 +4887,8 @@ Op.Type == AMDGPUOperand::ImmTyNegHi) { res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); + } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) { + res = parseDfmtNfmt(Operands); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); } Index: llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td @@ -100,15 +100,11 @@ bits<1> has_vaddr = 1; bits<1> has_glc = 1; bits<1> glc_value = 0; // the value for glc if no such operand - bits<4> dfmt_value = 1; // the value for dfmt if no such operand - bits<3> nfmt_value = 0; // the value for nfmt if no such operand bits<1> has_srsrc = 1; bits<1> has_soffset = 1; bits<1> has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; - bits<1> has_dfmt = 1; - bits<1> has_nfmt = 1; } class MTBUF_Real : @@ -126,14 +122,16 @@ bits<12> offset; bits<1> glc; - bits<4> dfmt; - bits<3> nfmt; + bits<7> format; bits<8> vaddr; bits<8> vdata; bits<7> srsrc; bits<1> slc; bits<1> tfe; bits<8> soffset; + + bits<4> dfmt = format{3-0}; + bits<3> nfmt = format{6-4}; } class getMTBUFInsDA vdataList, @@ -142,16 +140,16 @@ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); @@ -169,15 +167,15 @@ class getMTBUFAsmOps { string Pfx = - !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $format, $soffset", !if(!eq(addrKind, BUFAddrKind.OffEn), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + "$vaddr, $srsrc, $format, $soffset offen", !if(!eq(addrKind, BUFAddrKind.IdxEn), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + "$vaddr, $srsrc, $format, $soffset idxen", !if(!eq(addrKind, BUFAddrKind.BothEn), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + "$vaddr, $srsrc, $format, $soffset idxen offen", !if(!eq(addrKind, BUFAddrKind.Addr64), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + "$vaddr, $srsrc, $format, $soffset addr64", ""))))); string ret = Pfx # "$offset"; } @@ -217,14 +215,14 @@ def _OFFSET : MTBUF_Load_Pseudo , + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, + i1:$glc, i1:$slc, i1:$tfe)))]>, MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Load_Pseudo , + i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>, MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Load_Pseudo ; @@ -263,13 +261,13 @@ def _OFFSET : MTBUF_Store_Pseudo , MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Store_Pseudo , MTBUFAddr64Table<1, NAME>; @@ -1030,6 +1028,14 @@ // MUBUF Patterns //===----------------------------------------------------------------------===// +def extract_glc : SDNodeXFormgetTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8); +}]>; + +def extract_slc : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1524,32 +1530,36 @@ string opcode> { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, imm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, imm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; } @@ -1576,39 +1586,36 @@ string opcode> { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + imm:$format, imm:$cachepolicy, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $dfmt), - (as_i8imm $nfmt), (as_i1imm $glc), - (as_i1imm $slc), 0) + (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + imm:$format, imm:$cachepolicy, imm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $dfmt), - (as_i8imm $nfmt), (as_i1imm $glc), - (as_i1imm $slc), 0) + (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + imm:$format, imm:$cachepolicy, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $dfmt), - (as_i8imm $nfmt), (as_i1imm $glc), - (as_i1imm $slc), 0) + (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, - imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + imm:$offset, imm:$format, imm:$cachepolicy, imm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; } @@ -1781,8 +1788,8 @@ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{15} = ps.addr64; let Inst{18-16} = op; - let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); - let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); @@ -1811,6 +1818,7 @@ //===----------------------------------------------------------------------===// // CI +// MTBUF - GFX6, GFX7. //===----------------------------------------------------------------------===// class MUBUF_Real_ci op, MUBUF_Pseudo ps> : @@ -2013,8 +2021,8 @@ let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; - let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); - let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); @@ -2043,8 +2051,8 @@ let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; - let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); - let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); Index: llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -90,10 +90,8 @@ const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printDFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printNFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); + void printFORMAT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, Index: llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ llvm/trunk/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -236,21 +236,12 @@ O << " vm"; } -void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " dfmt:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " nfmt:"; - printU8ImmDecOperand(MI, OpNo, O); +void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (unsigned Val = MI->getOperand(OpNo).getImm()) { + O << " dfmt:" << (Val & 15); + O << ", nfmt:" << (Val >> 4); } } Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h @@ -65,6 +65,15 @@ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + // The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset + // (the offset that is included in bounds checking and swizzling, to be split + // between the instruction's voffset and immoffset fields) and soffset (the + // offset that is excluded from bounds checking and swizzling, to go in the + // instruction's soffset field). This function takes the first kind of + // offset and figures out how to split it between voffset and immoffset. + std::pair splitBufferOffsets(SDValue Offset, + SelectionDAG &DAG) const; + SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5158,6 +5158,13 @@ MemSDNode *M = cast(Op); EVT LoadVT = Op.getValueType(); + unsigned Dfmt = cast(Op.getOperand(7))->getZExtValue(); + unsigned Nfmt = cast(Op.getOperand(8))->getZExtValue(); + unsigned Glc = cast(Op.getOperand(9))->getZExtValue(); + unsigned Slc = cast(Op.getOperand(10))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast(Op.getOperand(3))) + IdxEn = Idx->getZExtValue() != 0; SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -5165,10 +5172,57 @@ Op.getOperand(4), // voffset Op.getOperand(5), // soffset Op.getOperand(6), // offset - Op.getOperand(7), // dfmt - Op.getOperand(8), // nfmt - Op.getOperand(9), // glc - Op.getOperand(10) // slc + DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + }; + + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_raw_tbuffer_load: { + MemSDNode *M = cast(Op); + EVT LoadVT = Op.getValueType(); + auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); + + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(4), // soffset + Offsets.second, // offset + Op.getOperand(5), // format + Op.getOperand(6), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_struct_tbuffer_load: { + MemSDNode *M = cast(Op); + EVT LoadVT = Op.getValueType(); + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy + DAG.getConstant(1, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -5407,6 +5461,10 @@ auto Opcode = NumChannels->getZExtValue() == 3 ? AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + unsigned Dfmt = cast(Op.getOperand(8))->getZExtValue(); + unsigned Nfmt = cast(Op.getOperand(9))->getZExtValue(); + unsigned Glc = cast(Op.getOperand(12))->getZExtValue(); + unsigned Slc = cast(Op.getOperand(13))->getZExtValue(); SDValue Ops[] = { Chain, Op.getOperand(3), // vdata @@ -5415,10 +5473,9 @@ VOffset, Op.getOperand(6), // soffset Op.getOperand(7), // inst_offset - Op.getOperand(8), // dfmt - Op.getOperand(9), // nfmt - Op.getOperand(12), // glc - Op.getOperand(13), // slc + DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen }; assert((cast(Op.getOperand(14)))->getZExtValue() == 0 && @@ -5438,6 +5495,13 @@ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + unsigned Dfmt = cast(Op.getOperand(8))->getZExtValue(); + unsigned Nfmt = cast(Op.getOperand(9))->getZExtValue(); + unsigned Glc = cast(Op.getOperand(10))->getZExtValue(); + unsigned Slc = cast(Op.getOperand(11))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; SDValue Ops[] = { Chain, VData, // vdata @@ -5446,10 +5510,59 @@ Op.getOperand(5), // voffset Op.getOperand(6), // soffset Op.getOperand(7), // offset - Op.getOperand(8), // dfmt - Op.getOperand(9), // nfmt - Op.getOperand(10), // glc - Op.getOperand(11) // slc + DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idexen + }; + unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + + case Intrinsic::amdgcn_struct_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + SDValue Ops[] = { + Chain, + VData, // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // format + Op.getOperand(8), // cachepolicy + DAG.getConstant(1, DL, MVT::i1), // idexen + }; + unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + + case Intrinsic::amdgcn_raw_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + SDValue Ops[] = { + Chain, + VData, // vdata + Op.getOperand(3), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -5490,6 +5603,50 @@ } } +// The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset +// (the offset that is included in bounds checking and swizzling, to be split +// between the instruction's voffset and immoffset fields) and soffset (the +// offset that is excluded from bounds checking and swizzling, to go in the +// instruction's soffset field). This function takes the first kind of offset +// and figures out how to split it between voffset and immoffset. +std::pair SITargetLowering::splitBufferOffsets( + SDValue Offset, SelectionDAG &DAG) const { + SDLoc DL(Offset); + const unsigned MaxImm = 4095; + SDValue N0 = Offset; + ConstantSDNode *C1 = nullptr; + if (N0.getOpcode() == ISD::ADD) { + if ((C1 = dyn_cast(N0.getOperand(1)))) + N0 = N0.getOperand(0); + } else if ((C1 = dyn_cast(N0))) + N0 = SDValue(); + + if (C1) { + unsigned ImmOffset = C1->getZExtValue(); + // If the immediate value is too big for the immoffset field, put the value + // mod 4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + C1 = cast(DAG.getConstant(ImmOffset, DL, MVT::i32)); + if (Overflow) { + auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32); + if (!N0) + N0 = OverflowVal; + else { + SDValue Ops[] = { N0, OverflowVal }; + N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops); + } + } + } + if (!N0) + N0 = DAG.getConstant(0, DL, MVT::i32); + if (!C1) + C1 = cast(DAG.getConstant(0, DL, MVT::i32)); + return {N0, SDValue(C1, 0)}; +} + static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT) { Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td @@ -69,36 +69,34 @@ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SDTbuffer_load : SDTypeProfile<1, 9, +def SDTtbuffer_load : SDTypeProfile<1, 8, [ // vdata SDTCisVT<1, v4i32>, // rsrc SDTCisVT<2, i32>, // vindex(VGPR) SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) + SDTCisVT<6, i32>, // format(imm) + SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<8, i1> // idxen(imm) ]>; -def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load, +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTtbuffer_load, [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16", - SDTbuffer_load, + SDTtbuffer_load, [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; -def SDTtbuffer_store : SDTypeProfile<0, 10, +def SDTtbuffer_store : SDTypeProfile<0, 9, [ // vdata SDTCisVT<1, v4i32>, // rsrc SDTCisVT<2, i32>, // vindex(VGPR) SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) + SDTCisVT<6, i32>, // format(imm) + SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<8, i1> // idxen(imm) ]>; def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, @@ -752,8 +750,7 @@ def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; -def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; -def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; +def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>; def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -0,0 +1,42 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}tbuffer_load_d16_x: +; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + ret half %data +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xy: +; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: +; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) + Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -0,0 +1,92 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_load: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 78, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 63, i32 1) + %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 2) + %vdata_f32 = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3 + ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs_large +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 73, i32 %soffs, i32 62, i32 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 1, i32 %soffs, i32 77, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen +define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +main_body: + %ofs = add i32 %voffs, 52 + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %ofs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_xy: +; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast <2 x i32> %vdata to <2 x float> + ret <2 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_x: +; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast i32 %vdata to float + ret float %vdata.f +} + +declare i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) + Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}tbuffer_store_d16_x: +; GCN: s_load_dwordx4 +; GCN: s_load_dword s[[S_LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] +; GCN: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { +main_body: + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xy: +; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 + +; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { +main_body: + call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 + + +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] +; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { +main_body: + call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll @@ -0,0 +1,75 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_store: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0 +; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 +define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + %in2 = bitcast <4 x float> %2 to <4 x i32> + %in3 = bitcast <4 x float> %3 to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 44, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 61, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 2) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_immoffs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 0, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42 +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 %soffset, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_ofs: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %voffset, i32 0, i32 115, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x1: +; GCN: tbuffer_store_format_x v0, off, s[0:3], dfmt:13, nfmt:7, 0 +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) { +main_body: + %data.i = bitcast float %data to i32 + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 125, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x2: +; GCN: tbuffer_store_format_xy v[0:1], off, s[0:3], dfmt:1, nfmt:2, 0 +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) { +main_body: + %data.i = bitcast <2 x float> %data to <2 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + + Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}tbuffer_load_d16_x: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + ret half %data +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xy: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32) + Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -0,0 +1,114 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_load: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 idxen glc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen slc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 63, i32 1) + %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 2) + %vdata_f32 = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3 + ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42 +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs_large +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 idxen offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} idxen offset:73 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} idxen offset:1 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 1, i32 %soffs, i32 77, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +; GCN-LABEL: {{^}}tbuffer_load_idx: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +main_body: + %ofs = add i32 %voffs, 52 + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_both: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + + +; GCN-LABEL: {{^}}buffer_load_xy: +; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen +define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast <2 x i32> %vdata to <2 x float> + ret <2 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_x: +; GCN: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen +define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast i32 %vdata to float + ret float %vdata.f +} + +declare i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32) + Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}tbuffer_store_d16_x: +; GCN: s_load_dwordx4 +; GCN: s_load_dword{{[x0-9]*}} s{{\[}}[[S_LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] +; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xy: +; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + +; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + + +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] +; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -0,0 +1,114 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_store: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:12, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], dfmt:13, nfmt:3, 0 idxen glc +; GCN: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen slc +; GCN: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen +define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + %in2 = bitcast <4 x float> %2 to <4 x i32> + %in3 = bitcast <4 x float> %3 to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 44, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 61, i32 1) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 2) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_immoffs: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42 +define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} idxen offset:42 +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 %soffset, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_idx: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 47, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_ofs: +; GCN: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], dfmt:3, nfmt:7, 0 idxen offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 115, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_both: +; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 70, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +; GCN-LABEL: {{^}}buffer_store_wait: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen +; VERDE: s_waitcnt expcnt(0) +; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; GCN: s_waitcnt vmcnt(0) +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) + %data.i = bitcast <4 x float> %data to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x1: +; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + %data.i = bitcast float %data to i32 + call void @llvm.amdgcn.struct.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 125, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x2: +; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { +main_body: + %data.i = bitcast <2 x float> %data to <2 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + + Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll @@ -70,14 +70,14 @@ ; VERDE: s_waitcnt expcnt(0) ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GCN: s_waitcnt vmcnt(0) -; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) %data.i = bitcast <4 x float> %data to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0) ret void } Index: llvm/trunk/test/MC/AMDGPU/mtbuf.s =================================================================== --- llvm/trunk/test/MC/AMDGPU/mtbuf.s +++ llvm/trunk/test/MC/AMDGPU/mtbuf.s @@ -38,3 +38,18 @@ // SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] // VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +// nfmt is optional: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe8,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71] + +// dfmt is optional: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x07,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x01,0x1d,0x71] + +// nfmt and dfmt can be in either order: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, dfmt:15, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +