Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5066,8 +5066,23 @@ } else { assert(OldSAddrIdx == NewVAddrIdx); - if (OldVAddrIdx >= 0) + if (OldVAddrIdx >= 0) { + int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); + + // RemoveOperand doesn't try to fixup tied operand indexes as it goes, so + // it asserts. Untie the operands for now and retie them afterwards. + if (NewVDstIn != -1) { + int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + Inst.untieRegOperand(OldVDstIn); + } + Inst.RemoveOperand(OldVAddrIdx); + + if (NewVDstIn != -1) { + int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); + Inst.tieOperands(NewVDst, NewVDstIn); + } + } } if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) Index: llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -31,3 +31,27 @@ %i9 = icmp eq i32 %i8, 256 br i1 %i9, label %bb2, label %bb3 } + +; GCN-LABEL: {{^}}test_move_load_address_to_vgpr_d16_hi: +; GCN-NOT: v_readfirstlane_b32 +; GCN: global_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}], off glc +define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(i16 addrspace(1)* nocapture %arg) { +bb: + %i1 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 0 + %load.pre = load volatile i16, i16 addrspace(1)* %i1, align 4 + %i2 = zext i16 %load.pre to i32 + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ %i2, %bb ], [ %i8, %bb3 ] + %i4 = zext i32 %i to i64 + %i5 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %i4 + %i6 = load volatile i16, i16 addrspace(1)* %i5, align 4 + %insertelt = 
insertelement <2 x i16> undef, i16 %i6, i32 1 + %i8 = bitcast <2 x i16> %insertelt to i32 + %i9 = icmp eq i32 %i8, 256 + br i1 %i9, label %bb2, label %bb3 +}