Index: lib/Target/AMDGPU/MIMGInstructions.td
===================================================================
--- lib/Target/AMDGPU/MIMGInstructions.td
+++ lib/Target/AMDGPU/MIMGInstructions.td
@@ -25,6 +25,7 @@
   let DecoderNamespace = dns;
   let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
   let AsmMatchConverter = "cvtMIMG";
+  let usesCustomInserter = 1;
 }
 
 class MIMG_NoSampler_Helper <bits<7> op, string asm,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1688,9 +1688,32 @@
 
 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   MachineInstr &MI, MachineBasicBlock *BB) const {
+
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  MachineFunction *MF = BB->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+  if (TII->isMIMG(MI)) {
+    if (!MI.memoperands_empty())
+      return BB;
+    // Add a memoperand for mimg instructions so that they aren't assumed to
+    // be ordered memory instructions.
+
+    MachinePointerInfo PtrInfo(MFI->getImagePSV());
+    MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
+    if (MI.mayStore())
+      Flags |= MachineMemOperand::MOStore;
+
+    if (MI.mayLoad())
+      Flags |= MachineMemOperand::MOLoad;
+
+    auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
+    MI.addMemOperand(*MF, MMO);
+    return BB;
+  }
+
   switch (MI.getOpcode()) {
   case AMDGPU::SI_INIT_M0: {
-    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
             AMDGPU::M0)
         .addOperand(MI.getOperand(0));
@@ -1698,10 +1721,6 @@
     return BB;
   }
   case AMDGPU::GET_GROUPSTATICSIZE: {
-    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
-
-    MachineFunction *MF = BB->getParent();
-    SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     DebugLoc DL = MI.getDebugLoc();
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
         .addOperand(MI.getOperand(0))
@@ -1725,7 +1744,6 @@
     return splitKillBlock(MI, BB);
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
 
     unsigned Dst = MI.getOperand(0).getReg();
     unsigned Src0 = MI.getOperand(1).getReg();
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -23,6 +23,31 @@
 
 class MachineRegisterInfo;
 
+class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
+public:
+  explicit AMDGPUImagePseudoSourceValue() :
+    PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+  bool isConstant(const MachineFrameInfo *) const override {
+    // This should probably be true for most images, but we will start by being
+    // conservative.
+    return false;
+  }
+
+  bool isAliased(const MachineFrameInfo *) const override {
+    // FIXME: If we ever change image intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+
+  bool mayAlias(const MachineFrameInfo *) const override {
+    // FIXME: If we ever change image intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+};
+
+
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
@@ -73,6 +98,8 @@
   // Stack object indices for work item IDs.
   std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
 
+  std::unique_ptr<AMDGPUImagePseudoSourceValue> ImagePSV;
+
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
@@ -434,6 +461,10 @@
     }
     llvm_unreachable("unexpected dimension");
   }
+
+  AMDGPUImagePseudoSourceValue *getImagePSV() {
+    return ImagePSV.get();
+  }
 };
 
 } // End namespace llvm
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -52,6 +52,7 @@
     WavesPerEU(0, 0),
     DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
     DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
+    ImagePSV(llvm::make_unique<AMDGPUImagePseudoSourceValue>()),
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=CHECK,VI %s
 
 ;CHECK-LABEL: {{^}}image_load_v4i32:
 ;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
@@ -144,6 +144,19 @@
   ret void
 }
 
+; SI won't merge ds memory operations, because of the signed offset bug, so
+; we only have check lines for VI.
+; VI-LABEL: image_load_mmo
+; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
+define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) {
+  store float 0.0, float addrspace(3)* %lds
+  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
+  store float 0.0, float addrspace(3)* %tmp2
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tex, float %tex, float %tex, float %tex)
+  ret void
+}
 
 declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
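
Note (illustrative, not part of the patch): the reason the memoperand matters is that passes which reorder or merge memory instructions, such as the machine scheduler and SILoadStoreOptimizer, treat an instruction that touches memory but carries no memoperands as an unknown, ordered access and refuse to move anything across it; that is what previously kept the two LDS stores in the new test from being merged into ds_write2_b32. Once a MachineMemOperand with a PseudoSourceValue is attached, such passes can answer alias queries instead. A simplified sketch of that kind of query, using a hypothetical helper name (mayAliasSimple) but only real LLVM APIs:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"

using namespace llvm;

// Hypothetical helper: conservatively decide whether two machine
// instructions may access overlapping memory, the way a scheduling-style
// pass would consume the memoperands this patch attaches.
static bool mayAliasSimple(const MachineInstr &A, const MachineInstr &B,
                           const MachineFrameInfo *MFI) {
  // No memoperands means "unknown memory access"; assume the worst.
  // Before this patch, every MIMG instruction took this path.
  if (A.memoperands_empty() || B.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMOa : A.memoperands()) {
    for (const MachineMemOperand *MMOb : B.memoperands()) {
      const PseudoSourceValue *PSVa = MMOa->getPseudoValue();
      const PseudoSourceValue *PSVb = MMOb->getPseudoValue();
      // A pseudo source value reporting !isAliased (as
      // AMDGPUImagePseudoSourceValue does) cannot overlap memory reached
      // through an IR value, e.g. the LDS pointer in image_load_mmo, so
      // such a pair is known to be disjoint.
      if (PSVa && !PSVa->isAliased(MFI) && MMOb->getValue())
        return false;
      if (PSVb && !PSVb->isAliased(MFI) && MMOa->getValue())
        return false;
    }
  }
  return true; // Conservative default: assume they may alias.
}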