Index: llvm/trunk/lib/Target/AMDGPU/SIDefines.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIDefines.h +++ llvm/trunk/lib/Target/AMDGPU/SIDefines.h @@ -40,7 +40,8 @@ FLAT = 1 << 21, WQM = 1 << 22, VGPRSpill = 1 << 23, - VOPAsmPrefer32Bit = 1 << 24 + VOPAsmPrefer32Bit = 1 << 24, + Gather4 = 1 << 25 }; } Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3133,7 +3133,8 @@ const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); unsigned Opcode = Node->getMachineOpcode(); - if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore()) + if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && + !TII->isGather4(Opcode)) adjustWritemask(Node, DAG); if (Opcode == AMDGPU::INSERT_SUBREG || Index: llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td @@ -48,6 +48,8 @@ // is unable to infer the encoding from the operands. field bits<1> VOPAsmPrefer32Bit = 0; + field bits<1> Gather4 = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -78,6 +80,7 @@ let TSFlags{22} = WQM; let TSFlags{23} = VGPRSpill; let TSFlags{24} = VOPAsmPrefer32Bit; + let TSFlags{25} = Gather4; let SchedRW = [Write32Bit]; Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h @@ -310,6 +310,14 @@ return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isGather4(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::Gather4; + } + + bool isGather4(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::Gather4; + } + static bool isFLAT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::FLAT; } Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td @@ -3557,8 +3557,8 @@ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns // (red,red,red,red) etc.) The ISA document doesn't mention // this. - // Therefore, disable all code which updates DMASK by setting these two: - let MIMG = 0; + // Therefore, disable all code which updates DMASK by setting this: + let Gather4 = 1; let hasPostISelHook = 0; let WQM = wqm; Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.SI.gather4.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.SI.gather4.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.SI.gather4.ll @@ -462,7 +462,27 @@ ret void } - +;CHECK-LABEL: {{^}}gather4_sgpr_bug: +; +; This crashed at some point due to a bug in FixSGPRCopies. Derived from the +; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877 +; +;TODO: the readfirstlanes are unnecessary, see http://reviews.llvm.org/D22217 +; +;CHECK: v_readfirstlane_b32 s[[LO:[0-9]+]], v{{[0-9]+}} +;CHECK: v_readfirstlane_b32 +;CHECK: v_readfirstlane_b32 +;CHECK: v_readfirstlane_b32 s[[HI:[0-9]+]], v{{[0-9]+}} +;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8 +define amdgpu_ps float @gather4_sgpr_bug() { +main_body: + %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef, align 16 + %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 + %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp4 = extractelement <4 x float> %tmp2, i32 1 + %tmp9 = fadd float undef, %tmp4 + ret float %tmp9 +} declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0