Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -289,6 +289,10 @@
     return getGeneration() >= GFX9;
   }
 
+  bool hasMin3Max3_16() const {
+    return getGeneration() >= GFX9;
+  }
+
   bool hasCARRY() const {
     return (getGeneration() >= EVERGREEN);
   }
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4989,7 +4989,8 @@
   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
-      VT != MVT::f64) {
+      VT != MVT::f64 &&
+      ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
Index: lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3Instructions.td
+++ lib/Target/AMDGPU/VOP3Instructions.td
@@ -301,10 +301,19 @@
 def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
 def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
 def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
 def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
 def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
-}
+
+def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmin3>;
+def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmin3>;
+def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumin3>;
+
+def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmax3>;
+def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmax3>;
+def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumax3>;
+} // End SubtargetPredicate = isGFX9
 
 //===----------------------------------------------------------------------===//
@@ -512,6 +521,15 @@
 defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
 
 defm V_XAD_U32 : VOP3_Real_vi <0x1f3>;
+
+defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>;
+defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>;
+defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>;
+
+defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>;
+defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>;
+defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>;
+
 defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
 defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
 defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
Index: test/CodeGen/AMDGPU/fmax3.ll
===================================================================
--- test/CodeGen/AMDGPU/fmax3.ll
+++ test/CodeGen/AMDGPU/fmax3.ll
@@ -1,39 +1,92 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.maxnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}test_fmax3_olt_0_f32:
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @test_fmax3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
-  %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
-  %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
+  %f0 = call float @llvm.maxnum.f32(float %a, float %b)
+  %f1 = call float @llvm.maxnum.f32(float %f0, float %c)
   store float %f1, float addrspace(1)* %out, align 4
   ret void
 }
 
 ; Commute operand of second fmax
-; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; GCN-LABEL: {{^}}test_fmax3_olt_1_f32:
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
-  %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
-  %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
+  %f0 = call float @llvm.maxnum.f32(float %a, float %b)
+  %f1 = call float @llvm.maxnum.f32(float %c, float %f0)
   store float %f1, float addrspace(1)* %out, align 4
   ret void
 }
+
+; GCN-LABEL: {{^}}test_fmax3_olt_0_f16:
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+
+; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_max_f16_e32
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+  %a = load volatile half, half addrspace(1)* %aptr, align 2
+  %b = load volatile half, half addrspace(1)* %bptr, align 2
+  %c = load volatile half, half addrspace(1)* %cptr, align 2
+  %f0 = call half @llvm.maxnum.f16(half %a, half %b)
+  %f1 = call half @llvm.maxnum.f16(half %f0, half %c)
+  store half %f1, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; Commute operand of second fmax
+; GCN-LABEL: {{^}}test_fmax3_olt_1_f16:
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+
+; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_max_f16_e32
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+  %a = load volatile half, half addrspace(1)* %aptr, align 2
+  %b = load volatile half, half addrspace(1)* %bptr, align 2
+  %c = load volatile half, half addrspace(1)* %cptr, align 2
+  %f0 = call half @llvm.maxnum.f16(half %a, half %b)
+  %f1 = call half @llvm.maxnum.f16(half %c, half %f0)
+  store half %f1, half addrspace(1)* %out, align 2
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare half @llvm.maxnum.f16(half, half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/fmin3.ll
===================================================================
--- test/CodeGen/AMDGPU/fmin3.ll
+++ test/CodeGen/AMDGPU/fmin3.ll
@@ -1,40 +1,90 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.minnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}test_fmin3_olt_0_f32:
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
-  %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
-  %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
+  %f0 = call float @llvm.minnum.f32(float %a, float %b)
+  %f1 = call float @llvm.minnum.f32(float %f0, float %c)
   store float %f1, float addrspace(1)* %out, align 4
   ret void
 }
 
 ; Commute operand of second fmin
-; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; GCN-LABEL: {{^}}test_fmin3_olt_1_f32:
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
-  %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
-  %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
+  %f0 = call float @llvm.minnum.f32(float %a, float %b)
+  %f1 = call float @llvm.minnum.f32(float %c, float %f0)
   store float %f1, float addrspace(1)* %out, align 4
   ret void
 }
+
+; GCN-LABEL: {{^}}test_fmin3_olt_0_f16:
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+
+; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_min_f16_e32
+; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+  %a = load volatile half, half addrspace(1)* %aptr, align 2
+  %b = load volatile half, half addrspace(1)* %bptr, align 2
+  %c = load volatile half, half addrspace(1)* %cptr, align 2
+  %f0 = call half @llvm.minnum.f16(half %a, half %b)
+  %f1 = call half @llvm.minnum.f16(half %f0, half %c)
+  store half %f1, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; Commute operand of second fmin
+; GCN-LABEL: {{^}}test_fmin3_olt_1_f16:
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+
+; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_min_f16_e32
+; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+  %a = load volatile half, half addrspace(1)* %aptr, align 2
+  %b = load volatile half, half addrspace(1)* %bptr, align 2
+  %c = load volatile half, half addrspace(1)* %cptr, align 2
+  %f0 = call half @llvm.minnum.f16(half %a, half %b)
+  %f1 = call half @llvm.minnum.f16(half %c, half %f0)
+  store half %f1, half addrspace(1)* %out, align 2
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.minnum.f32(float, float) #1
+declare half @llvm.minnum.f16(half, half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/max3.ll
===================================================================
--- test/CodeGen/AMDGPU/max3.ll
+++ test/CodeGen/AMDGPU/max3.ll
@@ -1,41 +1,94 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-
-; FUNC-LABEL: @v_test_imax3_sgt_i32
-; SI: v_max3_i32
-define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_imax3_sgt_i32:
+; GCN: v_max3_i32
+define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
   %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0, align 4
-  %b = load i32, i32 addrspace(1)* %gep1, align 4
-  %c = load i32, i32 addrspace(1)* %gep2, align 4
+  %a = load i32, i32 addrspace(1)* %gep0
+  %b = load i32, i32 addrspace(1)* %gep1
+  %c = load i32, i32 addrspace(1)* %gep2
   %icmp0 = icmp sgt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp sgt i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %out, align 4
+  store i32 %i1, i32 addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @v_test_umax3_ugt_i32
-; SI: v_max3_u32
-define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umax3_ugt_i32:
+; GCN: v_max3_u32
+define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
   %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0, align 4
-  %b = load i32, i32 addrspace(1)* %gep1, align 4
-  %c = load i32, i32 addrspace(1)* %gep2, align 4
+  %a = load i32, i32 addrspace(1)* %gep0
+  %b = load i32, i32 addrspace(1)* %gep1
+  %c = load i32, i32 addrspace(1)* %gep2
   %icmp0 = icmp ugt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp ugt i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %out, align 4
+  store i32 %i1, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_imax3_sgt_i16:
+; SI: v_max3_i32
+
+; VI: v_max_i16
+; VI: v_max_i16
+
+; GFX9: v_max3_i16
+define amdgpu_kernel void @v_test_imax3_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0
+  %b = load i16, i16 addrspace(1)* %gep1
+  %c = load i16, i16 addrspace(1)* %gep2
+  %icmp0 = icmp sgt i16 %a, %b
+  %i0 = select i1 %icmp0, i16 %a, i16 %b
+  %icmp1 = icmp sgt i16 %i0, %c
+  %i1 = select i1 %icmp1, i16 %i0, i16 %c
+  store i16 %i1, i16 addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}v_test_umax3_ugt_i16:
+; SI: v_max3_u32
+
+; VI: v_max_u16
+; VI: v_max_u16
+
+; GFX9: v_max3_u16
+define amdgpu_kernel void @v_test_umax3_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0
+  %b = load i16, i16 addrspace(1)* %gep1
+  %c = load i16, i16 addrspace(1)* %gep2
+  %icmp0 = icmp ugt i16 %a, %b
+  %i0 = select i1 %icmp0, i16 %a, i16 %b
+  %icmp1 = icmp ugt i16 %i0, %c
+  %i1 = select i1 %icmp1, i16 %i0, i16 %c
+  store i16 %i1, i16 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/min3.ll
===================================================================
--- test/CodeGen/AMDGPU/min3.ll
+++ test/CodeGen/AMDGPU/min3.ll
@@ -1,50 +1,50 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-
-; FUNC-LABEL: @v_test_imin3_slt_i32
-; SI: v_min3_i32
-define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}v_test_imin3_slt_i32:
+; GCN: v_min3_i32
+define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
   %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0, align 4
-  %b = load i32, i32 addrspace(1)* %gep1, align 4
-  %c = load i32, i32 addrspace(1)* %gep2, align 4
+  %a = load i32, i32 addrspace(1)* %gep0
+  %b = load i32, i32 addrspace(1)* %gep1
+  %c = load i32, i32 addrspace(1)* %gep2
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp slt i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  store i32 %i1, i32 addrspace(1)* %outgep
   ret void
 }
 
-; FUNC-LABEL: @v_test_umin3_ult_i32
-; SI: v_min3_u32
-define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin3_ult_i32:
+; GCN: v_min3_u32
+define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
   %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0, align 4
-  %b = load i32, i32 addrspace(1)* %gep1, align 4
-  %c = load i32, i32 addrspace(1)* %gep2, align 4
+  %a = load i32, i32 addrspace(1)* %gep0
+  %b = load i32, i32 addrspace(1)* %gep1
+  %c = load i32, i32 addrspace(1)* %gep2
   %icmp0 = icmp ult i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp ult i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  store i32 %i1, i32 addrspace(1)* %outgep
   ret void
 }
 
-; FUNC-LABEL: @v_test_umin_umin_umin
-; SI: v_min_i32
-; SI: v_min3_i32
-define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin_umin_umin:
+; GCN: v_min_i32
+; GCN: v_min3_i32
+define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid2 = mul i32 %tid, 2
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -57,10 +57,10 @@
   %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
 
-  %a = load i32, i32 addrspace(1)* %gep0, align 4
-  %b = load i32, i32 addrspace(1)* %gep1, align 4
-  %c = load i32, i32 addrspace(1)* %gep2, align 4
-  %d = load i32, i32 addrspace(1)* %gep3, align 4
+  %a = load i32, i32 addrspace(1)* %gep0
+  %b = load i32, i32 addrspace(1)* %gep1
+  %c = load i32, i32 addrspace(1)* %gep2
+  %d = load i32, i32 addrspace(1)* %gep3
 
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -71,14 +71,14 @@
   %icmp2 = icmp slt i32 %i0, %i1
   %i2 = select i1 %icmp2, i32 %i0, i32 %i1
 
-  store i32 %i2, i32 addrspace(1)* %outgep1, align 4
+  store i32 %i2, i32 addrspace(1)* %outgep1
   ret void
 }
 
-; FUNC-LABEL: @v_test_umin3_2_uses
-; SI-NOT: v_min3
-define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin3_2_uses:
+; GCN-NOT: v_min3
+define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid2 = mul i32 %tid, 2
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -91,10 +91,10 @@
   %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
 
-  %a = load i32, i32 addrspace(1)* %gep0, align 4
-  %b = load i32, i32 addrspace(1)* %gep1, align 4
-  %c = load i32, i32 addrspace(1)* %gep2, align 4
-  %d = load i32, i32 addrspace(1)* %gep3, align 4
+  %a = load i32, i32 addrspace(1)* %gep0
+  %b = load i32, i32 addrspace(1)* %gep1
+  %c = load i32, i32 addrspace(1)* %gep2
+  %d = load i32, i32 addrspace(1)* %gep3
 
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -105,7 +105,60 @@
   %icmp2 = icmp slt i32 %i0, %c
   %i2 = select i1 %icmp2, i32 %i0, i32 %c
 
-  store i32 %i2, i32 addrspace(1)* %outgep0, align 4
-  store i32 %i0, i32 addrspace(1)* %outgep1, align 4
+  store i32 %i2, i32 addrspace(1)* %outgep0
+  store i32 %i0, i32 addrspace(1)* %outgep1
   ret void
 }
+
+; GCN-LABEL: {{^}}v_test_imin3_slt_i16:
+; SI: v_min3_i32
+
+; VI: v_min_i16
+; VI: v_min_i16
+
+; GFX9: v_min3_i16
+define amdgpu_kernel void @v_test_imin3_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0
+  %b = load i16, i16 addrspace(1)* %gep1
+  %c = load i16, i16 addrspace(1)* %gep2
+  %icmp0 = icmp slt i16 %a, %b
+  %i0 = select i1 %icmp0, i16 %a, i16 %b
+  %icmp1 = icmp slt i16 %i0, %c
+  %i1 = select i1 %icmp1, i16 %i0, i16 %c
+  store i16 %i1, i16 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umin3_ult_i16:
+; SI: v_min3_u32
+
+; VI: v_min_u16
+; VI: v_min_u16
+
+; GFX9: v_min3_u16
+define amdgpu_kernel void @v_test_umin3_ult_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0
+  %b = load i16, i16 addrspace(1)* %gep1
+  %c = load i16, i16 addrspace(1)* %gep2
+  %icmp0 = icmp ult i16 %a, %b
+  %i0 = select i1 %icmp0, i16 %a, i16 %b
+  %icmp1 = icmp ult i16 %i0, %c
+  %i1 = select i1 %icmp1, i16 %i0, i16 %c
+  store i16 %i1, i16 addrspace(1)* %outgep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
Index: test/MC/AMDGPU/vop3-gfx9.s
===================================================================
--- test/MC/AMDGPU/vop3-gfx9.s
+++ test/MC/AMDGPU/vop3-gfx9.s
@@ -35,6 +35,30 @@
 // GFX9: v_xad_u32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf3,0xd1,0x02,0x07,0x12,0x04]
 // NOVI: :1: error: instruction not supported on this GPU
 
+v_min3_f16 v1, v2, v3, v4
+// GFX9: v_min3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf4,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_min3_i16 v1, v2, v3, v4
+// GFX9: v_min3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf5,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_min3_u16 v1, v2, v3, v4
+// GFX9: v_min3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf6,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_max3_f16 v1, v2, v3, v4
+// GFX9: v_max3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf7,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_max3_i16 v1, v2, v3, v4
+// GFX9: v_max3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf8,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_max3_u16 v1, v2, v3, v4
+// GFX9: v_max3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf9,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
 v_med3_f16 v1, v2, v3, v4
 // GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04]
 // NOVI: :1: error: instruction not supported on this GPU
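
Usage sketch (illustrative, not part of the patch): with the performMinMaxCombine change gated on hasMin3Max3_16(), chained llvm.maxnum.f16 calls compiled for gfx900 are expected to select a single v_max3_f16, while VI keeps two v_max_f16 instructions, matching the tests above. The kernel name, value names, and llc invocation below are hypothetical.

; Example invocation (assumed): llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < example.ll
define amdgpu_kernel void @example_fmax3_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) {
  ; max(max(a, b), c): one maxnum feeding another is the shape the
  ; DAG combine folds into the three-operand instruction on GFX9.
  %a = load half, half addrspace(1)* %aptr
  %b = load half, half addrspace(1)* %bptr
  %c = load half, half addrspace(1)* %cptr
  %m0 = call half @llvm.maxnum.f16(half %a, half %b)
  %m1 = call half @llvm.maxnum.f16(half %m0, half %c)
  store half %m1, half addrspace(1)* %out
  ret void
}

declare half @llvm.maxnum.f16(half, half)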