Index: llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -215,7 +215,7 @@

   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
     unsigned SrcReg = MI.getOperand(I).getReg();
-    unsigned SrcSubReg = MI.getOperand(I).getReg();
+    unsigned SrcSubReg = MI.getOperand(I).getSubReg();

     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
     assert(TRI->isSGPRClass(SrcRC) &&
Index: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -334,13 +334,10 @@
           !MRI.hasOneUse(MI.getOperand(0).getReg()))
         continue;

-      // FIXME: Fold operands with subregs.
       if (OpToFold.isReg() &&
-          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
-           OpToFold.getSubReg()))
+          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
         continue;
-
       // We need mutate the operands of new mov instructions to add implicit
       // uses of EXEC, but adding them invalidates the use_iterator, so defer
       // this.
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1777,6 +1777,10 @@
       MRI.getRegClass(Reg) :
       RI.getPhysRegClass(Reg);

+    const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+    RC = TRI->getSubRegClass(RC, MO.getSubReg());
+
     // In order to be legal, the common sub-class must be equal to the
     // class of the current operand. For example:
     //
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -464,12 +464,38 @@
   if (SubIdx == AMDGPU::NoSubRegister)
     return RC;

-  // If this register has a sub-register, we can safely assume it is a 32-bit
-  // register, because all of SI's sub-registers are 32-bit.
+  // We can assume that each lane corresponds to one 32-bit register.
+  unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
   if (isSGPRClass(RC)) {
-    return &AMDGPU::SGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::SGPR_32RegClass;
+    case 2:
+      return &AMDGPU::SReg_64RegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   } else {
-    return &AMDGPU::VGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::VGPR_32RegClass;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    case 8:
+      return &AMDGPU::VReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   }
 }

Index: llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -8,8 +8,8 @@

 ; FUNC-LABEL: @test_fmin_legacy_f32
 ; EG: MIN *
-; SI-SAFE: v_min_legacy_f32_e32
-; SI-NONAN: v_min_f32_e32
+; SI-SAFE: v_min_legacy_f32_e64
+; SI-NONAN: v_min_f32_e64
 define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
   %r0 = extractelement <4 x float> %reg0, i32 0
   %r1 = extractelement <4 x float> %reg0, i32 1
Index: llvm/trunk/test/CodeGen/AMDGPU/fsub.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fsub.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fsub.ll
@@ -32,9 +32,8 @@
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y

-; FIXME: Should be using SGPR directly for first operand
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
   %sub = fsub <2 x float> %a, %b
   store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
@@ -60,13 +59,11 @@
   ret void
 }

-; FIXME: Should be using SGPR directly for first operand
-
 ; FUNC-LABEL: {{^}}s_fsub_v4f32:
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: s_endpgm
 define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
   %result = fsub <4 x float> %a, %b
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -21,7 +21,7 @@
 ; SI-DAG: v_cmp_eq_i32

 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
-; SI-DAG: v_cmp_gt_i32_e32
+; SI-DAG: v_cmp_gt_i32
 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]

 ; SI: buffer_store_dwordx2
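For reference, the new SIRegisterInfo::getSubRegClass logic counts the set bits in the
sub-register index's lane mask (countPopulation(getSubRegIndexLaneMask(SubIdx))), where each
lane stands for one 32-bit register, and maps that count to a register class of matching width.
Below is a minimal standalone sketch of that mapping; the enum and helper are illustrative
stand-ins only, not the real TargetRegisterClass or LaneBitmask types used in the patch.

#include <bitset>
#include <cassert>
#include <cstdint>

// Illustrative stand-ins for the TableGen-generated SGPR register classes;
// the patch itself returns pointers to the real TargetRegisterClass objects.
enum class SRegClass { SGPR_32, SReg_64, SReg_128, SReg_256 };

// Each set bit in a sub-register index's lane mask covers one 32-bit lane, so
// the popcount of the mask gives the sub-register width in dwords. This mirrors
// the switch added to SIRegisterInfo::getSubRegClass for the SGPR case.
static SRegClass sgprClassForLaneMask(uint32_t LaneMask) {
  switch (std::bitset<32>(LaneMask).count()) {
  case 1:
    return SRegClass::SGPR_32;   // e.g. sub0: one 32-bit lane
  case 2:
    return SRegClass::SReg_64;   // e.g. sub0_sub1: two lanes
  case 4:
    return SRegClass::SReg_128;
  case 8:
    return SRegClass::SReg_256;
  default:
    assert(false && "Invalid sub-register class size");
    return SRegClass::SGPR_32;   // unreachable when assertions are enabled
  }
}

int main() {
  assert(sgprClassForLaneMask(0x1) == SRegClass::SGPR_32);  // single lane
  assert(sgprClassForLaneMask(0x3) == SRegClass::SReg_64);  // two lanes
  assert(sgprClassForLaneMask(0xF) == SRegClass::SReg_128); // four lanes
  return 0;
}

Compared to the old code, which unconditionally assumed every sub-register was 32 bits wide,
this lane-count lookup also handles wider sub-registers (64, 96, 128, 256 bits), which is what
allows SIFoldOperands and SIInstrInfo to legalize operands that carry a sub-register index.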