Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -343,11 +343,6 @@ std::pair AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { switch (CC) { - case CallingConv::AMDGPU_CS: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return std::make_pair(getWavefrontSize() * 2, - std::max(getWavefrontSize() * 4, 256u)); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: @@ -356,13 +351,12 @@ case CallingConv::AMDGPU_PS: return std::make_pair(1, getWavefrontSize()); default: - return std::make_pair(1, 16 * getWavefrontSize()); + return std::make_pair(1u, getMaxFlatWorkGroupSize()); } } std::pair AMDGPUSubtarget::getFlatWorkGroupSizes( const Function &F) const { - // FIXME: 1024 if function. // Default minimum/maximum flat work group sizes. std::pair Default = getDefaultFlatWorkGroupSize(F.getCallingConv()); Index: llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -412,7 +412,7 @@ ; OPT-LABEL: @pointer_typed_alloca( ; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}} ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4 -define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) { +define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) #1 { entry: %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5) store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4 @@ -556,7 +556,8 @@ ret void } -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="1,256" } ; HSAOPT: !0 = !{} ; HSAOPT: !1 = !{i32 0, i32 257} Index: llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -43,7 +43,7 @@ ret void } -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind convergent } Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -8,11 +8,11 @@ ; CHECK: --- ; CHECK: amdhsa.kernels: -; CHECK: - .args: +; CHECK: - .args: ; CHECK: .group_segment_fixed_size: 0 ; CHECK: .kernarg_segment_align: 8 ; CHECK: .kernarg_segment_size: 24 -; CHECK: .max_flat_workgroup_size: 256 +; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 ; WAVE64: .sgpr_count: 8 @@ -33,6 +33,20 @@ ret void } +; CHECK: - .args: +; CHECK: .max_flat_workgroup_size: 256 +define amdgpu_kernel void @test_max_flat_workgroup_size( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) #2 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fadd half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + ; CHECK: .name: num_spilled_sgprs ; GFX700: .sgpr_spill_count: 40 ; GFX803: .sgpr_spill_count: 24 @@ -149,3 +163,4 @@ attributes #0 = { "amdgpu-num-sgpr"="14" } attributes #1 = { "amdgpu-num-vgpr"="20" } +attributes #2 = { "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -18,7 +18,7 @@ ; CHECK: WavefrontSize: 64 ; CHECK: NumSGPRs: 8 ; CHECK: NumVGPRs: 6 -; CHECK: MaxFlatWorkGroupSize: 256 +; CHECK: MaxFlatWorkGroupSize: 1024 define amdgpu_kernel void @test( half addrspace(1)* %r, half addrspace(1)* %a, @@ -31,6 +31,29 @@ ret void } +; CHECK-LABEL: - Name: test_max_flat_workgroup_size +; CHECK: SymbolName: 'test_max_flat_workgroup_size@kd' +; CHECK: CodeProps: +; CHECK: KernargSegmentSize: 24 +; CHECK: GroupSegmentFixedSize: 0 +; CHECK: PrivateSegmentFixedSize: 0 +; CHECK: KernargSegmentAlign: 8 +; CHECK: WavefrontSize: 64 +; CHECK: NumSGPRs: 8 +; CHECK: NumVGPRs: 6 +; CHECK: MaxFlatWorkGroupSize: 256 +define amdgpu_kernel void @test_max_flat_workgroup_size( + half addrspace(1)* %r, + half addrspace(1)* %a, + half addrspace(1)* %b) #2 { +entry: + %a.val = load half, half addrspace(1)* %a + %b.val = load half, half addrspace(1)* %b + %r.val = fadd half %a.val, %b.val + store half %r.val, half addrspace(1)* %r + ret void +} + ; CHECK-LABEL: - Name: num_spilled_sgprs ; CHECK: SymbolName: 'num_spilled_sgprs@kd' ; CHECK: CodeProps: @@ -144,3 +167,4 @@ attributes #0 = { "amdgpu-num-sgpr"="14" } attributes #1 = { "amdgpu-num-vgpr"="20" } +attributes #2 = { "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll +++ llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll @@ -39,7 +39,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 -attributes #0 = { norecurse nounwind } +attributes #0 = { norecurse nounwind "amdgpu-flat-work-group-size"="1,256" } attributes #1 = { nounwind readnone } !0 = !{i32 0, i32 1024} Index: llvm/test/CodeGen/AMDGPU/occupancy-levels.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -262,8 +262,8 @@ } ; GCN-LABEL: {{^}}used_lds_6556: -; GFX9: ; Occupancy: 9 -; GFX1010W64: ; Occupancy: 19 +; GFX9: ; Occupancy: 10 +; GFX1010W64: ; Occupancy: 20 ; GFX1010W32: ; Occupancy: 20 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4 define amdgpu_kernel void @used_lds_6556() { @@ -273,9 +273,9 @@ } ; GCN-LABEL: {{^}}used_lds_13112: -; GFX9: ; Occupancy: 4 -; GFX1010W64: ; Occupancy: 9 -; GFX1010W32: ; Occupancy: 19 +; GFX9: ; Occupancy: 10 +; GFX1010W64: ; Occupancy: 20 +; GFX1010W32: ; Occupancy: 20 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4 define amdgpu_kernel void @used_lds_13112() { %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)* Index: llvm/test/CodeGen/AMDGPU/private-memory-r600.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/private-memory-r600.ll +++ llvm/test/CodeGen/AMDGPU/private-memory-r600.ll @@ -300,4 +300,4 @@ ; OPT: !0 = !{i32 0, i32 257} ; OPT: !1 = !{i32 0, i32 256} -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll +++ llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll @@ -18,4 +18,4 @@ ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll +++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll @@ -64,4 +64,4 @@ declare i32* @get_unknown_pointer() #0 -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -201,4 +201,4 @@ declare i32* @get_unknown_pointer() #0 -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll +++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll @@ -131,5 +131,5 @@ ret void } -attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" } +attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } attributes #1 = { norecurse nounwind }