Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -286,22 +286,27 @@
       // satisfied.
       if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
           MI.getOpcode() == AMDGPU::S_MUL_I32) {
-        const MachineOperand &Dest = MI.getOperand(0);
-        const MachineOperand &Src0 = MI.getOperand(1);
-        const MachineOperand &Src1 = MI.getOperand(2);
+        const MachineOperand *Dest = &MI.getOperand(0);
+        MachineOperand *Src0 = &MI.getOperand(1);
+        MachineOperand *Src1 = &MI.getOperand(2);
+
+        if (!Src0->isReg() && Src1->isReg()) {
+          if (TII->commuteInstruction(MI, false, 1, 2))
+            std::swap(Src0, Src1);
+        }
 
         // FIXME: This could work better if hints worked with subregisters. If
         // we have a vector add of a constant, we usually don't get the correct
         // allocation due to the subregister usage.
-        if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
-            Src0.isReg()) {
-          MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
-          MRI.setRegAllocationHint(Src0.getReg(), 0, Dest.getReg());
+        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+            Src0->isReg()) {
+          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
           continue;
         }
 
-        if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
-          if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
+          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
             unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
 
Index: test/CodeGen/AMDGPU/s_addk_i32.ll
===================================================================
--- test/CodeGen/AMDGPU/s_addk_i32.ll
+++ test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -91,3 +91,19 @@
   store i32 %add, i32 addrspace(1)* %out
   ret void
 }
+
+@lds = addrspace(3) global [512 x i32] undef, align 4
+
+; SI-LABEL: {{^}}commute_s_addk_i32:
+; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
+define void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+  %size = call i32 @llvm.amdgcn.groupstaticsize()
+  %add = add i32 %size, %b
+  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.groupstaticsize() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/s_mulk_i32.ll
===================================================================
--- test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -39,3 +39,19 @@
   store i32 %mul, i32 addrspace(1)* %out
   ret void
 }
+
+@lds = addrspace(3) global [512 x i32] undef, align 4
+
+; SI-LABEL: {{^}}commute_s_mulk_i32:
+; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
+define void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+  %size = call i32 @llvm.amdgcn.groupstaticsize()
+  %add = mul i32 %size, %b
+  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.groupstaticsize() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }