Index: clang/include/clang/Basic/BuiltinsAMDGPU.def =================================================================== --- clang/include/clang/Basic/BuiltinsAMDGPU.def +++ clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -103,12 +103,6 @@ BUILTIN(__builtin_amdgcn_s_sleep, "vIi", "n") BUILTIN(__builtin_amdgcn_s_incperflevel, "vIi", "n") BUILTIN(__builtin_amdgcn_s_decperflevel, "vIi", "n") -BUILTIN(__builtin_amdgcn_uicmp, "LUiUiUiIi", "nc") -BUILTIN(__builtin_amdgcn_uicmpl, "LUiLUiLUiIi", "nc") -BUILTIN(__builtin_amdgcn_sicmp, "LUiiiIi", "nc") -BUILTIN(__builtin_amdgcn_sicmpl, "LUiLiLiIi", "nc") -BUILTIN(__builtin_amdgcn_fcmp, "LUiddIi", "nc") -BUILTIN(__builtin_amdgcn_fcmpf, "LUiffIi", "nc") BUILTIN(__builtin_amdgcn_ds_swizzle, "iiIi", "nc") BUILTIN(__builtin_amdgcn_ds_permute, "iii", "nc") BUILTIN(__builtin_amdgcn_ds_bpermute, "iii", "nc") @@ -138,6 +132,21 @@ BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "LUiLUiUiLUi", "nc") BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiLUiUiV4Ui", "nc") +//===----------------------------------------------------------------------===// +// Ballot builtins. +//===----------------------------------------------------------------------===// + +TARGET_BUILTIN(__builtin_amdgcn_ballot_wave32, "Uib", "nc", "wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ballot_wave64, "LUib", "nc", "wavefrontsize64") + +// Deprecated intrinsics in favor of __builtin_amdgcn_ballot_{wave32|wave64} +BUILTIN(__builtin_amdgcn_uicmp, "LUiUiUiIi", "nc") +BUILTIN(__builtin_amdgcn_uicmpl, "LUiLUiLUiIi", "nc") +BUILTIN(__builtin_amdgcn_sicmp, "LUiiiIi", "nc") +BUILTIN(__builtin_amdgcn_sicmpl, "LUiLiLiIi", "nc") +BUILTIN(__builtin_amdgcn_fcmp, "LUiddIi", "nc") +BUILTIN(__builtin_amdgcn_fcmpf, "LUiffIi", "nc") + //===----------------------------------------------------------------------===// // CI+ only builtins.
//===----------------------------------------------------------------------===// Index: clang/lib/Basic/Targets/AMDGPU.cpp =================================================================== --- clang/lib/Basic/Targets/AMDGPU.cpp +++ clang/lib/Basic/Targets/AMDGPU.cpp @@ -164,6 +164,8 @@ bool AMDGPUTargetInfo::initFeatureMap( llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU, const std::vector<std::string> &FeatureVec) const { + const bool IsNullCPU = CPU.empty(); + bool IsWave32Capable = false; using namespace llvm::AMDGPU; @@ -171,6 +173,7 @@ if (isAMDGCN(getTriple())) { switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) { case GK_GFX1030: + IsWave32Capable = true; Features["ci-insts"] = true; Features["dot1-insts"] = true; Features["dot2-insts"] = true; @@ -194,6 +197,7 @@ Features["dot6-insts"] = true; LLVM_FALLTHROUGH; case GK_GFX1010: + IsWave32Capable = true; Features["dl-insts"] = true; Features["ci-insts"] = true; Features["flat-address-space"] = true; @@ -276,7 +280,21 @@ } } - return TargetInfo::initFeatureMap(Features, Diags, CPU, FeatureVec); + if (!TargetInfo::initFeatureMap(Features, Diags, CPU, FeatureVec)) + return false; + + // Don't assume any wavesize with an unknown subtarget. + if (!IsNullCPU) { + // Default to wave32 if available, or wave64 if not + if (Features.count("wavefrontsize32") == 0 && + Features.count("wavefrontsize64") == 0) { + StringRef DefaultWaveSizeFeature = IsWave32Capable ?
+ "wavefrontsize32" : "wavefrontsize64"; + Features.insert(std::make_pair(DefaultWaveSizeFeature, true)); + } + } + + return true; } void AMDGPUTargetInfo::fillValidCPUList( Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -14517,6 +14517,13 @@ return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_ubfe); case AMDGPU::BI__builtin_amdgcn_sbfe: return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_sbfe); + case AMDGPU::BI__builtin_amdgcn_ballot_wave32: + case AMDGPU::BI__builtin_amdgcn_ballot_wave64: { + llvm::Type *ResultType = ConvertType(E->getType()); + llvm::Value *Src = EmitScalarExpr(E->getArg(0)); + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); + return Builder.CreateCall(F, { Src }); + } case AMDGPU::BI__builtin_amdgcn_uicmp: case AMDGPU::BI__builtin_amdgcn_uicmpl: case AMDGPU::BI__builtin_amdgcn_sicmp: Index: clang/test/CodeGenOpenCL/amdgpu-features.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-features.cl +++ clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -3,6 +3,10 @@ // Check that appropriate features are defined for every supported AMDGPU // "-target" and "-mcpu" options. 
+// RUN: %clang_cc1 -triple amdgcn -S -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU %s +// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE32 %s +// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE64 %s + // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX600 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX601 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx700 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX700 %s @@ -15,16 +19,20 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1030 %s -// GFX600-NOT: "target-features" -// GFX601-NOT: "target-features" -// GFX700: "target-features"="+ci-insts,+flat-address-space" -// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime" -// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime" -// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX1011: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" -// GFX1030: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" +// NOCPU-NOT: "target-features" +// NOCPU-WAVE32: "target-features"="+wavefrontsize32" +// NOCPU-WAVE64: "target-features"="+wavefrontsize64" + +// GFX600: "target-features"="+wavefrontsize64" +// GFX601: "target-features"="+wavefrontsize64" +// GFX700: "target-features"="+ci-insts,+flat-address-space,+wavefrontsize64" +// GFX801: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+s-memrealtime,+wavefrontsize64" +// GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+wavefrontsize64" +// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+wavefrontsize64" +// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+wavefrontsize64" +// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+wavefrontsize32" +// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+wavefrontsize32" +// GFX1012: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+wavefrontsize32" +// GFX1030: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+wavefrontsize32" kernel void test() {} Index: clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl =================================================================== --- clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl +++ clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl @@ -22,3 +22,10 @@ void test_mov_dpp8(global uint* out, uint a) { *out = __builtin_amdgcn_mov_dpp8(a, 1); } + +// CHECK-LABEL: @test_ballot_wave32( +// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}}) +void test_ballot_wave32(global uint* out, int a, int b) +{ + *out = __builtin_amdgcn_ballot_wave32(a == b); +} Index: clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl =================================================================== --- /dev/null +++ clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s + +typedef unsigned int uint; + +// CHECK-LABEL: @test_ballot_wave32( +// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}}) +void test_ballot_wave32(global uint* out, int a, int b) +{ + *out = __builtin_amdgcn_ballot_wave32(a == b); +} + Index: clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl =================================================================== --- /dev/null +++ clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 
-cl-std=CL2.0 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s +// XUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature -wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s + +typedef unsigned long ulong; + +// CHECK-LABEL: @test_ballot_wave64( +// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 %{{.+}}) +void test_ballot_wave64(global ulong* out, int a, int b) +{ + *out = __builtin_amdgcn_ballot_wave64(a == b); +} + Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl =================================================================== --- /dev/null +++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx900 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize64 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize32 -verify -S -o - %s + +typedef unsigned int uint; + +void test_ballot_wave32(global uint* out, int a, int b) { + *out = __builtin_amdgcn_ballot_wave32(a == b); // expected-error {{'__builtin_amdgcn_ballot_wave32' needs target feature wavefrontsize32}} +} Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl =================================================================== --- /dev/null +++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize32 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize64 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu 
gfx1010 -verify -S -o - %s + +typedef unsigned long ulong; + +void test_ballot_wave64(global ulong* out, int a, int b) { + *out = __builtin_amdgcn_ballot_wave64(a == b); // expected-error {{'__builtin_amdgcn_ballot_wave64' needs target feature wavefrontsize64}} +}