Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -169,6 +169,12 @@
   [FeatureFP64]
 >;
 
+def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
+  "FPExceptions",
+  "true",
+  "Enable floating point exceptions"
+>;
+
 def FeatureEnableHugeScratchBuffer : SubtargetFeature<
   "huge-scratch-buffer",
   "EnableHugeScratchBuffer",
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -257,6 +257,9 @@
   FMIN3,
   SMIN3,
   UMIN3,
+  FMED3,
+  SMED3,
+  UMED3,
   URECIP,
   DIV_SCALE,
   DIV_FMAS,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -397,7 +397,7 @@
   // SI at least has hardware support for floating point exceptions, but no way
   // of using or handling them is implemented. They are also optional in OpenCL
   // (Section 7.3)
-  setHasFloatingPointExceptions(false);
+  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
 
   setSelectIsExpensive(false);
   PredictableSelectIsExpensive = false;
@@ -2949,6 +2949,9 @@
   NODE_NAME_CASE(FMIN3)
   NODE_NAME_CASE(SMIN3)
   NODE_NAME_CASE(UMIN3)
+  NODE_NAME_CASE(FMED3)
+  NODE_NAME_CASE(SMED3)
+  NODE_NAME_CASE(UMED3)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DIV_SCALE)
   NODE_NAME_CASE(DIV_FMAS)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -209,6 +209,16 @@
   []
 >;
 
+def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp,
+  []
+>;
+
+def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
+  []
+>;
+
+def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+
 def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
                     SDTypeProfile<0, 1, [SDTCisInt<0>]>,
                     [SDNPHasChain, SDNPInGlue]>;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -66,6 +66,7 @@
   bool FP64;
   bool FP64Denormals;
   bool FP32Denormals;
+  bool FPExceptions;
   bool FastFMAF32;
   bool HalfRate64Ops;
   bool CaymanISA;
@@ -150,6 +151,10 @@
     return FP64Denormals;
   }
 
+  bool hasFPExceptions() const {
+    return FPExceptions;
+  }
+
   bool hasFastFMAF32() const {
     return FastFMAF32;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -66,9 +66,9 @@
   : AMDGPUGenSubtargetInfo(TT, GPU, FS),
     DumpCode(false), R600ALUInst(false), HasVertexCache(false),
     TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
-    FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
-    HalfRate64Ops(false), CaymanISA(false), FlatAddressSpace(false),
-    FlatForGlobal(false), EnableIRStructurizer(true),
+    FP64Denormals(false), FP32Denormals(false), FPExceptions(false),
+    FastFMAF32(false), HalfRate64Ops(false), CaymanISA(false),
+    FlatAddressSpace(false), FlatForGlobal(false), EnableIRStructurizer(true),
     EnablePromoteAlloca(false), EnableIfCvt(true),
     EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -54,7 +54,8 @@
   SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2131,8 +2131,70 @@
   }
 }
 
-SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
-                                                 DAGCombinerInfo &DCI) const {
+static SDValue performIntMed3ImmCombine(SelectionDAG &DAG,
+                                        SDLoc SL,
+                                        SDValue Op0,
+                                        SDValue Op1,
+                                        bool Signed) {
+  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
+  if (!K1)
+    return SDValue();
+
+  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+  if (!K0)
+    return SDValue();
+
+
+  if (Signed) {
+    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+      return SDValue();
+  } else {
+    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+      return SDValue();
+  }
+
+  EVT VT = K0->getValueType(0);
+  return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+                     Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+}
+
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+    return true;
+
+  return DAG.isKnownNeverNaN(Op);
+}
+
+static SDValue performFPMed3ImmCombine(SelectionDAG &DAG,
+                                       SDLoc SL,
+                                       SDValue Op0,
+                                       SDValue Op1) {
+  ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+  if (!K1)
+    return SDValue();
+
+  ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+  if (!K0)
+    return SDValue();
+
+  // Ordered >= (although NaN inputs should have folded away by now).
+  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
+  if (Cmp == APFloat::cmpGreaterThan)
+    return SDValue();
+
+  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
+  // give the other result, which is different from med3 with a NaN input.
+  SDValue Var = Op0.getOperand(0);
+  if (!isKnownNeverSNan(DAG, Var))
+    return SDValue();
+
+  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+                     Var, SDValue(K0, 0), SDValue(K1, 0));
+}
+
+SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
 
   unsigned Opc = N->getOpcode();
@@ -2142,7 +2204,8 @@
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
 
-  // max(max(a, b), c)
+  // max(max(a, b), c) -> max3(a, b, c)
+  // min(min(a, b), c) -> min3(a, b, c)
   if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
     SDLoc DL(N);
     return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
@@ -2153,7 +2216,9 @@
                        Op1);
   }
 
-  // max(a, max(b, c))
+  // Try commuted.
+  // max(a, max(b, c)) -> max3(a, b, c)
+  // min(a, min(b, c)) -> min3(a, b, c)
   if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
     SDLoc DL(N);
     return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
@@ -2164,6 +2229,24 @@
                        Op1.getOperand(1));
   }
 
+  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
+    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
+      return Med3;
+  }
+
+  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
+    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
+      return Med3;
+  }
+
+  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
+  if (Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM &&
+      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
+      return Res;
+  }
+
   return SDValue();
 }
 
@@ -2217,7 +2300,7 @@
     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
         N->getValueType(0) != MVT::f64 &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return performMin3Max3Combine(N, DCI);
+      return performMinMaxCombine(N, DCI);
 
     break;
   }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1695,13 +1695,13 @@
   VOP_I32_I32_I32_I32, AMDGPUumax3
 >;
 defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32",
-  VOP_F32_F32_F32_F32
+  VOP_F32_F32_F32_F32, AMDGPUfmed3
 >;
 defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32",
-  VOP_I32_I32_I32_I32
+  VOP_I32_I32_I32_I32, AMDGPUsmed3
 >;
 defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
-  VOP_I32_I32_I32_I32
+  VOP_I32_I32_I32_I32, AMDGPUumed3
 >;
 //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
Index: test/CodeGen/AMDGPU/fmed3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fmed3.ll
@@ -0,0 +1,131 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.minnum.f64(double, double) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+
+  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+  store float %med, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute0_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+
+  %max = call float @llvm.maxnum.f32(float 2.0, float %a)
+  %med = call float @llvm.minnum.f32(float 4.0, float %max)
+
+  store float %med, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute1_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+
+  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %med = call float @llvm.minnum.f32(float 4.0, float %max)
+
+  store float %med, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+
+  %max = call float @llvm.maxnum.f32(float %a, float 4.0)
+  %med = call float @llvm.minnum.f32(float %max, float 2.0)
+
+  store float %med, float addrspace(1)* %outgep
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+
+  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+  store volatile float %med, float addrspace(1)* %outgep
+  store volatile float %max, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
+; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, 2.0, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, 4.0, {{v\[[0-9]+:[0-9]+\]}}
+define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+
+  %max = call double @llvm.maxnum.f64(double %a, double 2.0)
+  %med = call double @llvm.minnum.f64(double %max, double 4.0)
+
+  store double %med, double addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
+; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+
+  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+  store float %med, float addrspace(1)* %outgep
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
Index: test/CodeGen/AMDGPU/smed3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/smed3.ll
@@ -0,0 +1,120 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
+; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp sgt i32 %a, 12
+  %i0 = select i1 %icmp0, i32 %a, i32 12
+
+  %icmp1 = icmp slt i32 %i0, 17
+  %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+  store i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32:
+; GCN: v_max_i32
+; GCN: v_min_i32
+define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp sgt i32 %a, 12
+  %i0 = select i1 %icmp0, i32 %a, i32 12
+
+  %icmp1 = icmp slt i32 %i0, 17
+  %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+  store volatile i32 %i0, i32 addrspace(1)* %outgep
+  store volatile i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32:
+; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp sgt i32 %a, 17
+  %i0 = select i1 %icmp0, i32 %a, i32 17
+
+  %icmp1 = icmp slt i32 %i0, 12
+  %i1 = select i1 %icmp1, i32 %i0, i32 12
+
+  store i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32:
+; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp ugt i32 %a, 12
+  %i0 = select i1 %icmp0, i32 %a, i32 12
+
+  %icmp1 = icmp slt i32 %i0, 17
+  %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+  store i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64:
+; GCN: v_cmp_lt_i64
+; GCN: v_cmp_gt_i64
+define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep0
+
+  %icmp0 = icmp sgt i64 %a, 12
+  %i0 = select i1 %icmp0, i64 %a, i64 12
+
+  %icmp1 = icmp slt i64 %i0, 17
+  %i1 = select i1 %icmp1, i64 %i0, i64 17
+
+  store i64 %i1, i64 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
+; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0
+
+  %icmp0 = icmp sgt i16 %a, 12
+  %i0 = select i1 %icmp0, i16 %a, i16 12
+
+  %icmp1 = icmp slt i16 %i0, 17
+  %i1 = select i1 %icmp1, i16 %i0, i16 17
+
+  store i16 %i1, i16 addrspace(1)* %outgep
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/umed3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/umed3.ll
@@ -0,0 +1,119 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
+; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp ugt i32 %a, 12
+  %i0 = select i1 %icmp0, i32 %a, i32 12
+
+  %icmp1 = icmp ult i32 %i0, 17
+  %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+  store i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
+; GCN: v_max_u32
+; GCN: v_min_u32
+define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp ugt i32 %a, 12
+  %i0 = select i1 %icmp0, i32 %a, i32 12
+
+  %icmp1 = icmp ult i32 %i0, 17
+  %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+  store volatile i32 %i0, i32 addrspace(1)* %outgep
+  store volatile i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
+; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp ugt i32 %a, 17
+  %i0 = select i1 %icmp0, i32 %a, i32 17
+
+  %icmp1 = icmp ult i32 %i0, 12
+  %i1 = select i1 %icmp1, i32 %i0, i32 12
+
+  store i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
+; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0
+
+  %icmp0 = icmp sgt i32 %a, 12
+  %i0 = select i1 %icmp0, i32 %a, i32 12
+
+  %icmp1 = icmp ult i32 %i0, 17
+  %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+  store i32 %i1, i32 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
+; GCN: v_cmp_lt_u64
+; GCN: v_cmp_gt_u64
+define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep0
+
+  %icmp0 = icmp ugt i64 %a, 12
+  %i0 = select i1 %icmp0, i64 %a, i64 12
+
+  %icmp1 = icmp ult i64 %i0, 17
+  %i1 = select i1 %icmp1, i64 %i0, i64 17
+
+  store i64 %i1, i64 addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
+; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0
+
+  %icmp0 = icmp ugt i16 %a, 12
+  %i0 = select i1 %icmp0, i16 %a, i16 12
+
+  %icmp1 = icmp ult i16 %i0, 17
+  %i1 = select i1 %icmp1, i16 %i0, i16 17
+
+  store i16 %i1, i16 addrspace(1)* %outgep
+  ret void
+}
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
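
Usage note (editorial, not part of the patch): the clamp idiom that performMinMaxCombine now recognizes is exactly what the tests above exercise. A minimal sketch, with hypothetical function names and illustrative constants, of IR that should select v_med3_i32 and v_med3_f32 after this change; note the combine only runs after legalization and only above CodeGenOpt::None, per the guard in performDAGCombine:

; Illustrative only. Clamp %x to [12, 17]: the icmp/select pairs become
; SMAX/SMIN nodes, and smin(smax(%x, 12), 17) with 12 < 17 is folded to
; AMDGPUISD::SMED3.
define void @clamp_i32(i32 addrspace(1)* %out, i32 %x) #0 {
  %cmp0 = icmp sgt i32 %x, 12
  %lo = select i1 %cmp0, i32 %x, i32 12
  %cmp1 = icmp slt i32 %lo, 17
  %clamped = select i1 %cmp1, i32 %lo, i32 17
  store i32 %clamped, i32 addrspace(1)* %out
  ret void
}

; fminnum(fmaxnum(%x, 2.0), 4.0) is folded to AMDGPUISD::FMED3 (f32 only),
; provided %x is known not to be a signaling NaN (see the note below).
define void @clamp_f32(float addrspace(1)* %out, float %x) #0 {
  %max = call float @llvm.maxnum.f32(float %x, float 2.0)
  %med = call float @llvm.minnum.f32(float %max, float 4.0)
  store float %med, float addrspace(1)* %out
  ret void
}

declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

If the constants fail the K0 < K1 precondition (checked with sge/uge for the integer forms and APFloat::compare for the FP form), as in the constant_order tests, the separate max/min sequence is kept.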
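
Note on the signaling-NaN guard in performFPMed3ImmCombine, traced with the constants from fmed3.ll (the med3 outcome is inferred from the comment's reasoning, not quoted from the ISA manual): if %x is a signaling NaN, v_max_f32(%x, 2.0) in IEEE mode quiets it and returns a quiet NaN, and v_min_f32(qNaN, 4.0) then returns the other operand, 4.0. A single v_med3_f32(%x, 2.0, 4.0) never materializes that intermediate quiet NaN, so it need not produce 4.0, and the fold would change an observable result. Hence isKnownNeverSNan(): with the new fp-exceptions feature disabled (the default), hasFloatingPointExceptions() is false and signaling NaNs are assumed not to matter, so the fold always applies; with -mattr=+fp-exceptions it applies only when DAG.isKnownNeverNaN() proves the operand is not a NaN, which is the split the NOSNAN/SNAN prefixes in fmed3.ll pin down.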