Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -59,6 +59,18 @@ } else ExtReg = extendRegister(ValVReg, VA); + // If this is a scalar return, insert a readfirstlane just in case the value + // ends up in a VGPR. + // FIXME: Assert this is a shader return. + const SIRegisterInfo *TRI + = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + if (TRI->isSGPRReg(MRI, PhysReg)) { + auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, + {MRI.getType(ExtReg)}, false) + .addReg(ExtReg); + ExtReg = ToSGPR.getReg(0); + } + MIRBuilder.buildCopy(PhysReg, ExtReg); MIB.addUse(PhysReg, RegState::Implicit); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -29,8 +29,7 @@ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %bswap = call i32 @llvm.bswap.i32(i32 %src) - %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap) - ret i32 %to.sgpr + ret i32 %bswap } define i32 @v_bswap_i32(i32 %src) { @@ -96,13 +95,7 @@ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) - %bswap.0 = extractelement <2 x i32> %bswap, i32 0 - %bswap.1 = extractelement <2 x i32> %bswap, i32 1 - %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) - %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) - %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 - %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 - ret <2 x i32> %ins.1 + ret <2 x i32> %bswap } define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) { @@ -137,7 +130,7 @@ ret <2 x i32> %bswap } -define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg 
%src) { +define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) { ; GFX7-LABEL: s_bswap_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 @@ -173,14 +166,7 @@ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %bswap = call i64 @llvm.bswap.i64(i64 %src) - %cast = bitcast i64 %bswap to <2 x i32> - %elt0 = extractelement <2 x i32> %cast, i32 0 - %elt1 = extractelement <2 x i32> %cast, i32 1 - %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) - %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) - %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 - %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 - ret <2 x i32> %ins.1 + ret i64 %bswap } define i64 @v_bswap_i64(i64 %src) { @@ -218,7 +204,7 @@ ret i64 %bswap } -define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) { +define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) { ; GFX7-LABEL: s_bswap_v2i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 @@ -274,20 +260,7 @@ ; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) - %cast = bitcast <2 x i64> %bswap to <4 x i32> - %bswap.0 = extractelement <4 x i32> %cast, i32 0 - %bswap.1 = extractelement <4 x i32> %cast, i32 1 - %bswap.2 = extractelement <4 x i32> %cast, i32 2 - %bswap.3 = extractelement <4 x i32> %cast, i32 3 - %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) - %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) - %to.sgpr2 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.2) - %to.sgpr3 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.3) - %ins.0 = insertelement <4 x i32> undef, i32 %to.sgpr0, i32 0 - %ins.1 = insertelement <4 x i32> %ins.0, i32 %to.sgpr1, i32 1 - %ins.2 = insertelement <4 x i32> %ins.1, i32 %to.sgpr2, i32 2 - %ins.3 = insertelement <4 x i32> %ins.2, i32 %to.sgpr3, i32 3 - ret <4 x i32> %ins.3 + ret <2 x i64> %bswap } 
define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) { @@ -345,7 +318,6 @@ ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshr_b32 s0, s0, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_bswap_i16: @@ -364,10 +336,7 @@ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %bswap = call i16 @llvm.bswap.i16(i16 %src) - %zext = zext i16 %bswap to i32 - %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) - %trunc = trunc i32 %to.sgpr to i16 - ret i16 %trunc + ret i16 %bswap } define i16 @v_bswap_i16(i16 %src) { @@ -431,9 +400,8 @@ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) - %cast0 = bitcast <2 x i16> %bswap to i32 - %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) - ret i32 %to.sgpr + %cast = bitcast <2 x i16> %bswap to i32 + ret i32 %cast } define i32 @v_bswap_i16_zext_to_i32(i16 %src) { @@ -574,7 +542,6 @@ ret i64 %zext } -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 declare i16 @llvm.bswap.i16(i16) #1 declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1 declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll @@ -31,6 +31,77 @@ ret void } +define amdgpu_ps float @vgpr_return(i32 %vgpr) { + ; CHECK-LABEL: name: vgpr_return + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: $vgpr0 = COPY [[COPY]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %cast = bitcast i32 %vgpr to float + ret float %cast +} + +define amdgpu_ps i32 @sgpr_return_i32(i32 %vgpr) { + ; CHECK-LABEL: name: sgpr_return_i32 + ; CHECK: bb.1 
(%ir-block.0): + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 + ret i32 %vgpr +} + +define amdgpu_ps i64 @sgpr_return_i64(i64 %vgpr) { + ; CHECK-LABEL: name: sgpr_return_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: $sgpr1 = COPY [[INT1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ret i64 %vgpr +} + +define amdgpu_ps <2 x i32> @sgpr_return_v2i32(<2 x i32> %vgpr) { + ; CHECK-LABEL: name: sgpr_return_v2i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: $sgpr1 = COPY [[INT1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ret <2 x i32> %vgpr +} + +define amdgpu_ps { i32, i32 } @sgpr_struct_return_i32_i32(i32 %vgpr0, i32 
%vgpr1) { + ; CHECK-LABEL: name: sgpr_struct_return_i32_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: $sgpr1 = COPY [[INT1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + %insertvalue0 = insertvalue { i32, i32 } undef, i32 %vgpr0, 0 + %value = insertvalue { i32, i32 } %insertvalue0, i32 %vgpr1, 1 + ret { i32, i32 } %value +} + declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0 attributes #0 = { nounwind }