Index: lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- lib/CodeGen/PeepholeOptimizer.cpp
+++ lib/CodeGen/PeepholeOptimizer.cpp
@@ -1239,6 +1239,45 @@
   // Currently we haven't seen motivating example for that and we
   // want to avoid untested code.
   NumRewrittenCopies += Changed;
+
+  // Fold simple extracts from REG_SEQUENCE to use the original source inserted
+  // into the super register.
+  //
+  // e.g.
+  //   vreg0 = ...
+  //   vreg1 = ...
+  //   vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1
+  //   vreg3 = COPY vreg2, sub0
+  //
+  //   => vreg3 = COPY vreg0
+  //
+  // TODO: Handle more complex cases with sub-subregisters:
+  //
+  //   vreg0 = ...
+  //   vreg1 = ...
+  //   vreg2 = ...
+  //   vreg3 = ...
+  //   vreg4 = REG_SEQUENCE vreg0, sub0, vreg1, sub1
+  //   vreg5 = REG_SEQUENCE vreg4, sub0_sub1, vreg2, sub2, vreg3, sub3
+  //   vreg6 = COPY vreg5, sub0
+  //   => vreg6 = COPY vreg0
+  MachineOperand &Src = MI->getOperand(1);
+  if (Src.getSubReg()) {
+    MachineInstr *SrcDef = MRI->getUniqueVRegDef(Src.getReg());
+    if (SrcDef->getOpcode() == TargetOpcode::REG_SEQUENCE) {
+      for (unsigned I = 1, E = SrcDef->getNumOperands(); I != E; I += 2) {
+        if (Src.getSubReg() == SrcDef->getOperand(I + 1).getImm()) {
+          MachineOperand &Op = SrcDef->getOperand(I);
+          Src.setReg(Op.getReg());
+          Src.setSubReg(Op.getSubReg());
+          MRI->clearKillFlags(Op.getReg());
+          Changed = true;
+          break;
+        }
+      }
+    }
+  }
+
   return Changed;
 }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1856,15 +1856,10 @@
   }
 
   MachineBasicBlock &MBB = *MI->getParent();
-  // Extract the ptr from the resource descriptor.
-
-  // SRsrcPtrLo = srsrc:sub0
-  unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
-      &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
-  // SRsrcPtrHi = srsrc:sub1
-  unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
-      &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+  // Extract the ptr from the resource descriptor.
+  unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
 
   // Create an empty resource descriptor
   unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1889,36 +1884,39 @@
       .addImm(RsrcDataFormat >> 32);
 
   // NewSRsrc = {Zero64, SRsrcFormat}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewSRsrc)
-          .addReg(Zero64)
-          .addImm(AMDGPU::sub0_sub1)
-          .addReg(SRsrcFormatLo)
-          .addImm(AMDGPU::sub2)
-          .addReg(SRsrcFormatHi)
-          .addImm(AMDGPU::sub3);
+  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+      .addReg(Zero64)
+      .addImm(AMDGPU::sub0_sub1)
+      .addReg(SRsrcFormatLo)
+      .addImm(AMDGPU::sub2)
+      .addReg(SRsrcFormatHi)
+      .addImm(AMDGPU::sub3);
 
   MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
   unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-  unsigned NewVAddrLo;
-  unsigned NewVAddrHi;
 
   if (VAddr) {
     // This is already an ADDR64 instruction so we need to add the pointer
     // extracted from the resource descriptor to the current value of VAddr.
-    NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
-    // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
+    // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
     DebugLoc DL = MI->getDebugLoc();
     BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
-      .addReg(SRsrcPtrLo)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
       .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
 
-    // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
+    // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
     BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
-      .addReg(SRsrcPtrHi)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
       .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(NewVAddrLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(NewVAddrHi)
+      .addImm(AMDGPU::sub1);
   } else {
     // This instructions is the _OFFSET variant, so we need to convert it to
     // ADDR64.
@@ -1945,21 +1943,17 @@
     MI->removeFromParent();
     MI = Addr64;
 
-    NewVAddrLo = SRsrcPtrLo;
-    NewVAddrHi = SRsrcPtrHi;
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+      .addImm(AMDGPU::sub1);
+
     VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
     SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
   }
 
-  // NewVaddr = {NewVaddrHi, NewVaddrLo}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewVAddr)
-      .addReg(NewVAddrLo)
-      .addImm(AMDGPU::sub0)
-      .addReg(NewVAddrHi)
-      .addImm(AMDGPU::sub1);
-
-  // Update the instruction to use NewVaddr
   VAddr->setReg(NewVAddr);
   // Update the instruction to use NewSRsrc
Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -35,14 +35,11 @@
   ret void
 }
 
-; FIXME: Shuffling to new superregister
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]]
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]]
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]]
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
@@ -64,11 +61,15 @@
   ret void
 }
 
+
+; FIXME: the v_lshl_b64 x, x, 32 is a bad way of doing a copy
+
 ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
+; CI: v_lshr_b64 v{{\[}}[[Y_COPY:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[REG_X]]:[[REG_Y]]{{\]}}, 32
 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[Y_COPY]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
 define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
@@ -140,13 +141,21 @@
 
 ; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI-NOT: v_mov_b32
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dword
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -310,13 +310,12 @@
   ret void
 }
 
-; FIXME: Shouldn't do 4th conversion
 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
 ; GCN: buffer_load_dwordx4
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; GCN-NOT: v_cvt_f16_f32_e32
 ; GCN: buffer_store_short
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
@@ -395,38 +394,38 @@
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
 ; GCN: s_endpgm
 define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
   %val = load <16 x float>, <16 x float> addrspace(1)* %in
Index: test/CodeGen/AMDGPU/llvm.round.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -21,12 +21,9 @@
 ; SI-DAG: v_cmp_eq_i32
 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
-; SI-DAG: v_cmp_gt_i32_e64
+; SI-DAG: v_cmp_gt_i32_e32
 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
-; SI-DAG: v_cmp_gt_i32_e64
-
-
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; Check that when a mubuf addr64 instruction is handled in moveToVALU, dead
+; register writes for the pointer are not emitted.
+
+; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
+
+; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
+; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
+
+; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
+; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
+
+; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
+; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
+; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
+
+define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+bb:
+  %tmp = icmp sgt i32 %arg3, 0
+  br i1 %tmp, label %bb4, label %bb17
+
+bb4:
+  %tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg
+  %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1
+  %tmp16 = load volatile i8, i8 addrspace(1)* %tmp15
+  br label %bb17
+
+bb17:
+  ret void
+}
+
+attributes #0 = { nounwind }