Index: lib/Target/R600/AMDGPUISelLowering.h
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.h
+++ lib/Target/R600/AMDGPUISelLowering.h
@@ -210,6 +210,12 @@
   FMIN_LEGACY,
   SMIN,
   UMIN,
+  FMAX3,
+  SMAX3,
+  UMAX3,
+  FMIN3,
+  SMIN3,
+  UMIN3,
   URECIP,
   DIV_SCALE,
   DIV_FMAS,
Index: lib/Target/R600/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.cpp
+++ lib/Target/R600/AMDGPUISelLowering.cpp
@@ -2355,6 +2355,12 @@
   NODE_NAME_CASE(FMIN_LEGACY)
   NODE_NAME_CASE(SMIN)
   NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(FMAX3)
+  NODE_NAME_CASE(SMAX3)
+  NODE_NAME_CASE(UMAX3)
+  NODE_NAME_CASE(FMIN3)
+  NODE_NAME_CASE(SMIN3)
+  NODE_NAME_CASE(UMIN3)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DIV_SCALE)
   NODE_NAME_CASE(DIV_FMAS)
Index: lib/Target/R600/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/R600/AMDGPUInstrInfo.td
+++ lib/Target/R600/AMDGPUInstrInfo.td
@@ -84,7 +84,7 @@
   [SDNPAssociative]
 >;
 
-// out = min(a, b) a snd b are signed ints
+// out = min(a, b) a and b are signed ints
 def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
   [SDNPCommutative, SDNPAssociative]
 >;
@@ -94,6 +94,37 @@
   [SDNPCommutative, SDNPAssociative]
 >;
 
+// FIXME: TableGen doesn't like commutative instructions with more
+// than 2 operands.
+// out = max(a, b, c) a, b and c are floats
+def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b and c are signed ints
+def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b and c are unsigned ints
+def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are floats
+def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are signed ints
+def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are unsigned ints
+def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
 
 def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
   SDTIntToFPOp, []>;
@@ -104,7 +135,6 @@
 def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
   SDTIntToFPOp, []>;
 
-
 // urecip - This operation is a helper for integer division, it returns the
 // result of 1 / a as a fractional unsigned integer.
 // out = (2^32 / a) + e
Index: lib/Target/R600/SIISelLowering.h
===================================================================
--- lib/Target/R600/SIISelLowering.h
+++ lib/Target/R600/SIISelLowering.h
@@ -59,6 +59,8 @@
                                unsigned AS,
                                DAGCombinerInfo &DCI) const;
 
+  SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+
 public:
   SITargetLowering(TargetMachine &tm);
Index: lib/Target/R600/SIISelLowering.cpp
===================================================================
--- lib/Target/R600/SIISelLowering.cpp
+++ lib/Target/R600/SIISelLowering.cpp
@@ -1305,6 +1305,7 @@
   return SDValue();
 }
 
+
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
 // This is a variant of
@@ -1355,6 +1356,61 @@
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
 }
 
+static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
+  switch (Opc) {
+  case ISD::FMAXNUM:
+    return AMDGPUISD::FMAX3;
+  case AMDGPUISD::SMAX:
+    return AMDGPUISD::SMAX3;
+  case AMDGPUISD::UMAX:
+    return AMDGPUISD::UMAX3;
+  case ISD::FMINNUM:
+    return AMDGPUISD::FMIN3;
+  case AMDGPUISD::SMIN:
+    return AMDGPUISD::SMIN3;
+  case AMDGPUISD::UMIN:
+    return AMDGPUISD::UMIN3;
+  default:
+    llvm_unreachable("Not a min/max opcode");
+  }
+}
+
+SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  unsigned Opc = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Only do this if the inner op has one use since this will just increase
+  // register pressure for no benefit.
+
+  // max(max(a, b), c)
+  if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                       DL,
+                       N->getValueType(0),
+                       Op0.getOperand(0),
+                       Op0.getOperand(1),
+                       Op1);
+  }
+
+  // max(a, max(b, c))
+  if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                       DL,
+                       N->getValueType(0),
+                       Op0,
+                       Op1.getOperand(0),
+                       Op1.getOperand(1));
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -1382,6 +1438,17 @@
     }
     break;
   }
+  case ISD::FMAXNUM: // TODO: What about fmax_legacy?
+  case ISD::FMINNUM:
+  case AMDGPUISD::SMAX:
+  case AMDGPUISD::SMIN:
+  case AMDGPUISD::UMAX:
+  case AMDGPUISD::UMIN: {
+    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+        getTargetMachine().getOptLevel() > CodeGenOpt::None)
+      return performMin3Max3Combine(N, DCI);
+    break;
+  }
   case AMDGPUISD::CVT_F32_UBYTE0:
   case AMDGPUISD::CVT_F32_UBYTE1:
Index: lib/Target/R600/SIInstructions.td
===================================================================
--- lib/Target/R600/SIInstructions.td
+++ lib/Target/R600/SIInstructions.td
@@ -1544,6 +1544,7 @@
   VOP_F64_F64_F64_F64, fma
 >;
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
+
 defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "V_ALIGNBIT_B32",
   VOP_I32_I32_I32_I32
 >;
@@ -1552,15 +1553,27 @@
 >;
 defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "V_MULLIT_F32",
   VOP_F32_F32_F32_F32>;
-////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
-////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
-////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
-////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
-////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
-////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
-////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
-////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
-////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
+defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "V_MIN3_F32",
+  VOP_F32_F32_F32_F32, AMDGPUfmin3>;
+
+defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "V_MIN3_I32",
+  VOP_I32_I32_I32_I32, AMDGPUsmin3
+>;
+defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "V_MIN3_U32",
+  VOP_I32_I32_I32_I32, AMDGPUumin3
+>;
+defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "V_MAX3_F32",
+  VOP_F32_F32_F32_F32, AMDGPUfmax3
+>;
+defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "V_MAX3_I32",
+  VOP_I32_I32_I32_I32, AMDGPUsmax3
+>;
+defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "V_MAX3_U32",
+  VOP_I32_I32_I32_I32, AMDGPUumax3
+>;
+//def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
+//def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
+//def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
 //def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
 //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
 //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
Index: test/CodeGen/R600/fmax3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/fmax3.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @test_fmax3_ogt_0:
+; SI: BUFFER_LOAD_DWORD [[REGA:v[0-9]+]]
+; SI: BUFFER_LOAD_DWORD [[REGB:v[0-9]+]]
+; SI: BUFFER_LOAD_DWORD [[REGC:v[0-9]+]]
+; SI: V_MAX3_F32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @test_fmax3_ogt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp ogt float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp ogt float %f0, %c
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_fmax3_ogt_1:
+; SI: V_MAX3_F32
+; SI: S_ENDPGM
+define void @test_fmax3_ogt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp ogt float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp ole float %c, %f0
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_fmax3_ogt_2:
+; SI-NOT: V_MAX3_F32
+; SI: S_ENDPGM
+define void @test_fmax3_ogt_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp ogt float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp ogt float %c, %f0
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_fmax3_oge_0
+; SI: V_MAX3_F32
+; SI: S_ENDPGM
+define void @test_fmax3_oge_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp oge float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp oge float %f0, %c
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
Index: test/CodeGen/R600/fmin3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/fmin3.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @test_fmin3_olt_0:
+; SI: BUFFER_LOAD_DWORD [[REGA:v[0-9]+]]
+; SI: BUFFER_LOAD_DWORD [[REGB:v[0-9]+]]
+; SI: BUFFER_LOAD_DWORD [[REGC:v[0-9]+]]
+; SI: V_MIN3_F32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp olt float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp olt float %f0, %c
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_fmin3_olt_1:
+; SI: V_MIN3_F32
+; SI: S_ENDPGM
+define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp olt float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp oge float %c, %f0
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_fmin3_olt_2:
+; SI-NOT: V_MIN3_F32
+; SI: S_ENDPGM
+define void @test_fmin3_olt_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp ole float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp olt float %c, %f0
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @test_fmin3_olt_3:
+; SI-NOT: V_MIN3_F32
+; SI: S_ENDPGM
+define void @test_fmin3_olt_3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %fcmp0 = fcmp olt float %a, %b
+  %f0 = select i1 %fcmp0, float %a, float %b
+  %fcmp1 = fcmp olt float %c, %f0
+  %f1 = select i1 %fcmp1, float %f0, float %c
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
Index: test/CodeGen/R600/max3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/max3.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imax3_sgt_i32
+; SI: V_MAX3_I32
+define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp sgt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp sgt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umax3_ugt_i32
+; SI: V_MAX3_U32
+define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp ugt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp ugt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %out, align 4
+  ret void
+}
Index: test/CodeGen/R600/min3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/min3.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imin3_slt_i32
+; SI: V_MIN3_I32
+define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp slt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp slt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin3_ult_i32
+; SI: V_MIN3_U32
+define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp ult i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp ult i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin_umin_umin
+; SI: V_MIN_I32
+; SI: V_MIN3_I32
+define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tid2 = mul i32 %tid, 2
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+
+  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
+  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+
+  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %d = load i32 addrspace(1)* %gep3, align 4
+
+  %icmp0 = icmp slt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+
+  %icmp1 = icmp slt i32 %c, %d
+  %i1 = select i1 %icmp1, i32 %c, i32 %d
+
+  %icmp2 = icmp slt i32 %i0, %i1
+  %i2 = select i1 %icmp2, i32 %i0, i32 %i1
+
+  store i32 %i2, i32 addrspace(1)* %outgep1, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin3_2_uses
+; SI-NOT: V_MIN3
+define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tid2 = mul i32 %tid, 2
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+
+  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
+  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+
+  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %d = load i32 addrspace(1)* %gep3, align 4
+
+  %icmp0 = icmp slt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+
+  %icmp1 = icmp slt i32 %c, %d
+  %i1 = select i1 %icmp1, i32 %c, i32 %d
+
+  %icmp2 = icmp slt i32 %i0, %c
+  %i2 = select i1 %icmp2, i32 %i0, i32 %c
+
+  store i32 %i2, i32 addrspace(1)* %outgep0, align 4
+  store i32 %i0, i32 addrspace(1)* %outgep1, align 4
+  ret void
+}
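
Note: performMin3Max3Combine also matches the form where the inner min/max is the second operand, i.e. min(a, min(b, c)), via the Op1.getOpcode() == Opc branch. A minimal sketch of such a test in the style of min3.ll follows; the function name and the FileCheck expectation are illustrative only (not part of the patch), and it omits the tid-based addressing used in the i32 tests above for brevity:

; FUNC-LABEL: @v_test_imin3_slt_i32_rhs
; SI: V_MIN3_I32
; SI: S_ENDPGM
define void @v_test_imin3_slt_i32_rhs(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
  %a = load i32 addrspace(1)* %aptr, align 4
  %b = load i32 addrspace(1)* %bptr, align 4
  %c = load i32 addrspace(1)* %cptr, align 4
  %icmp0 = icmp slt i32 %b, %c
  %i0 = select i1 %icmp0, i32 %b, i32 %c   ; inner min(b, c), single use
  %icmp1 = icmp slt i32 %a, %i0
  %i1 = select i1 %icmp1, i32 %a, i32 %i0  ; outer min(a, min(b, c))
  store i32 %i1, i32 addrspace(1)* %out, align 4
  ret void
}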