Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -6374,7 +6374,7 @@ /// \brief Adjust the writemask of MIMG instructions void SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { - SDNode *Users[4] = { }; + SDNode *Users[4] = { nullptr }; unsigned Lane = 0; unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); @@ -6426,18 +6426,6 @@ Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); - // If we only got one lane, replace it with a copy - // (if NewDmask has only one bit set...) - if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), - MVT::i32); - SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, - SDLoc(), Users[Lane]->getValueType(0), - SDValue(Node, 0), RC); - DAG.ReplaceAllUsesWith(Users[Lane], Copy); - return; - } - // Update the users of the node with the new indices for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { SDNode *User = Users[i]; @@ -6606,18 +6594,41 @@ unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; - for (unsigned i = 0; i < 4; ++i) - BitsSet += Writemask & (1 << i) ? 1 : 0; + unsigned SubRegIdx; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 
1 : 0; switch (BitsSet) { - default: return; - case 1: RC = &AMDGPU::VGPR_32RegClass; break; - case 2: RC = &AMDGPU::VReg_64RegClass; break; - case 3: RC = &AMDGPU::VReg_96RegClass; break; + default: + return; + case 1: + RC = &AMDGPU::VGPR_32RegClass; + SubRegIdx = AMDGPU::sub0; + break; + case 2: + RC = &AMDGPU::VReg_64RegClass; + SubRegIdx = AMDGPU::sub0_sub1; + break; + case 3: + RC = &AMDGPU::VReg_96RegClass; + SubRegIdx = AMDGPU::sub0_sub1_sub2; + break; } + auto InsPt = std::next(MI.getIterator()); + + unsigned TmpSuperReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass); + unsigned TmpReg = MRI.createVirtualRegister(RC); + unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); MI.setDesc(TII->get(NewOpcode)); - MRI.setRegClass(VReg, RC); + MI.getOperand(0).setReg(TmpReg); + + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), InsPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), TmpSuperReg); + BuildMI(*MI.getParent(), InsPt, DL, TII->get(AMDGPU::INSERT_SUBREG), VReg) + .addReg(TmpSuperReg) + .addReg(TmpReg) + .addImm(SubRegIdx); return; } Index: test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}adjust_writemask_crash_0: +; GCN: image_get_lod v[0:1], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2 +; GCN-NOT: v1 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0 +define amdgpu_ps void @adjust_writemask_crash_0() #0 { +main_body: + %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp1 = bitcast <2 x float> %tmp to <2 x i32> + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %tmp3 = bitcast <4 x i32> 
%tmp2 to <4 x float> + %tmp4 = extractelement <4 x float> %tmp3, i32 0 + store volatile float %tmp4, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_crash_1: +; GCN: image_get_lod v[0:1], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN-NOT: v1 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0 +define amdgpu_ps void @adjust_writemask_crash_1() #0 { +main_body: + %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp1 = bitcast <2 x float> %tmp to <2 x i32> + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef> + %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> + %tmp4 = extractelement <4 x float> %tmp3, i32 1 + store volatile float %tmp4, float addrspace(1)* undef + ret void +} + +declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly }