Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -768,6 +768,19 @@
   GCCBuiltin<"__builtin_amdgcn_readlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

+// The value to write and lane select arguments must be uniform across the
+// currently active threads of the current wave. Otherwise, the result is
+// undefined.
+def int_amdgcn_writelane :
+  GCCBuiltin<"__builtin_amdgcn_writelane">,
+  Intrinsic<[llvm_i32_ty], [
+    llvm_i32_ty,    // uniform value to write: returned by the selected lane
+    llvm_i32_ty,    // uniform lane select
+    llvm_i32_ty     // returned by all lanes other than the selected one
+  ],
+  [IntrNoMem, IntrConvergent]
+>;
+
 def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
   [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2711,8 +2711,9 @@
     }
   }

-  // Verify VOP*
-  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
+  // Verify VOP*. Ignore multiple SGPR operands on writelane.
+  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
+      && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
     // Only look at the true operands. Only a real operand can use the constant
     // bus, and we don't want to check pseudo-operands like the source modifier
     // flags.
@@ -3147,6 +3148,29 @@
     legalizeOpWithMove(MI, Src0Idx);
   }

+  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
+  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
+  // src0/src1 with V_READFIRSTLANE.
+  if (Opc == AMDGPU::V_WRITELANE_B32) {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    MachineOperand &Src0 = MI.getOperand(Src0Idx);
+    const DebugLoc &DL = MI.getDebugLoc();
+    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+          .add(Src0);
+      Src0.ChangeToRegister(Reg, false);
+    }
+    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      const DebugLoc &DL = MI.getDebugLoc();
+      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+          .add(Src1);
+      Src1.ChangeToRegister(Reg, false);
+    }
+    return;
+  }
+
   // VOP2 src0 instructions support all operand types, so we don't need to check
   // their legality. If src1 is already legal, we don't need to do anything.
   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -732,11 +732,18 @@
     if (SpillToVGPR) {
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+      // FIXME: Using Undef on the "old value of VGPR" operand here is a hack
+      // to avoid MachineVerifier objecting when that same physical VGPR was
+      // killed by an earlier instruction. The way VGPRs are allocated for
+      // SGPR spilling is hacky, and the undef flags are incorrect and could
+      // cause subsequent problems, though they are no more incorrect than
+      // they were before this extra operand was added to V_WRITELANE_B32.
       BuildMI(*MBB, MI, DL,
               TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
               Spill.VGPR)
         .addReg(SubReg, getKillRegState(IsKill))
-        .addImm(Spill.Lane);
+        .addImm(Spill.Lane)
+        .addReg(Spill.VGPR, RegState::Undef);

       // FIXME: Since this spills to another register instead of an actual
       // frame index, we should delete the frame index when all references to
Index: lib/Target/AMDGPU/VOP2Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP2Instructions.td
+++ lib/Target/AMDGPU/VOP2Instructions.td
@@ -323,15 +323,17 @@
   let HasSDWA9 = 0;
 }

-def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
   let Outs32 = (outs VGPR_32:$vdst);
   let Outs64 = Outs32;
-  let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
+  let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in);
   let Ins64 = Ins32;
   let Asm32 = " $vdst, $src0, $src1";
   let Asm64 = Asm32;
   let HasExt = 0;
   let HasSDWA9 = 0;
+  let HasSrc2 = 0;
+  let HasSrc2Mods = 0;
 }

 //===----------------------------------------------------------------------===//
@@ -399,7 +401,10 @@
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
   [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;

-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
+  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+} // End $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1

 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT>;
@@ -640,7 +645,7 @@

 defm V_READLANE_B32 : VOP2_Real_si <0x01>;

-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
 defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
 }
Index: test/CodeGen/AMDGPU/byval-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -33,16 +33,14 @@
 ; GCN-DAG: buffer_store_dword v32
 ; GCN-DAG: buffer_store_dword v33
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: v_writelane_b32
-
+; GCN-DAG: v_writelane_b32
 ; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
-
 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
+; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}

-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]

 ; GCN: s_swappc_b64
@@ -80,10 +78,10 @@
 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24

-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20

 ; GCN-NOT: s_add_u32 s32, s32, 0x800
Index: test/CodeGen/AMDGPU/callee-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -44,7 +44,7 @@
 ; GCN-DAG: v_writelane_b32 v32, s35,
 ; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: s_mov_b32 s33, s5
Index: test/CodeGen/AMDGPU/inserted-wait-states.mir
===================================================================
--- test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -308,7 +308,7 @@

   bb.1:
     $vgpr0,$sgpr0_sgpr1 = V_ADD_I32_e64 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-    $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0
+    $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0, $vgpr4
     S_BRANCH %bb.2

   bb.2:
@@ -318,7 +318,7 @@

   bb.3:
     $vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-    $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo
+    $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
     S_ENDPGM
...
Index: test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+
+; CHECK-LABEL: {{^}}test_writelane_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
+define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+  %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
+  %oldval = load i32, i32 addrspace(1)* %out
+  %lane = extractelement <2 x i32> %args, i32 1
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; TODO: m0 should be folded.
+; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
+; CHECK: s_mov_b32 m0, -1
+; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/sibling-call.ll
===================================================================
--- test/CodeGen/AMDGPU/sibling-call.ll
+++ test/CodeGen/AMDGPU/sibling-call.ll
@@ -216,7 +216,7 @@
 ; GCN-DAG: v_writelane_b32 v34, s35, 2
 ; GCN-DAG: s_add_u32 s32, s32, 0x400

-; GCN: s_getpc_b64
+; GCN-DAG: s_getpc_b64
 ; GCN: s_swappc_b64
 ; GCN: s_getpc_b64 s[6:7]
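For context when reviewing, below is a minimal usage sketch of the new intrinsic in LLVM IR. It mirrors the cases in the added llvm.amdgcn.writelane.ll test; the kernel name and arguments are illustrative only and are not part of the patch. Both the value to write and the lane select must be uniform across the active lanes of the wave, otherwise the result is undefined.

; Lane %lane of the result receives %value; every other active lane keeps the
; old value passed as the third operand of the intrinsic.
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0

define amdgpu_kernel void @writelane_example(i32 addrspace(1)* %out, i32 %value, i32 %lane) #1 {
  %old = load i32, i32 addrspace(1)* %out
  %result = call i32 @llvm.amdgcn.writelane(i32 %value, i32 %lane, i32 %old)
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }

This selects v_writelane_b32 with the destination VGPR tied to the new $vdst_in operand (the Constraints = "$vdst = $vdst_in" above), which is why the intrinsic takes the old value of the destination as its third argument.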