Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1041,8 +1041,10 @@ addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); - setOperationAction(ISD::FMAXIMUM, MVT::f64, Custom); - setOperationAction(ISD::FMINIMUM, MVT::f64, Custom); + for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) { + setOperationAction(ISD::FMAXIMUM, VT, Custom); + setOperationAction(ISD::FMINIMUM, VT, Custom); + } for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { @@ -1368,6 +1370,11 @@ addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); + for (auto VT : { MVT::v8f32, MVT::v4f64 }) { + setOperationAction(ISD::FMAXIMUM, VT, Custom); + setOperationAction(ISD::FMINIMUM, VT, Custom); + } + for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); @@ -1700,6 +1707,11 @@ addRegisterClass(MVT::v32f16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + for (auto VT : { MVT::v16f32, MVT::v8f64 }) { + setOperationAction(ISD::FMAXIMUM, VT, Custom); + setOperationAction(ISD::FMINIMUM, VT, Custom); + } + for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); @@ -30234,9 +30246,9 @@ SDValue X = Op.getOperand(0); SDValue Y = Op.getOperand(1); SDLoc DL(Op); - uint64_t SizeInBits = VT.getFixedSizeInBits(); + uint64_t SizeInBits = VT.getScalarSizeInBits(); APInt PreferredZero = APInt::getZero(SizeInBits); - EVT IVT = MVT::getIntegerVT(SizeInBits); + EVT IVT = VT.changeTypeToInteger(); X86ISD::NodeType MinMaxOp; if (Op.getOpcode() == ISD::FMAXIMUM) { MinMaxOp = X86ISD::FMAX; @@ -30270,6 +30282,19 @@ return 
CstOp->getValueAPF().bitcastToAPInt() == PreferredZero; if (auto *CstOp = dyn_cast<ConstantSDNode>(Op)) return CstOp->getAPIntValue() == PreferredZero; + if (Op->getOpcode() == ISD::BUILD_VECTOR || + Op->getOpcode() == ISD::SPLAT_VECTOR) { + for (const SDValue &OpVal : Op->op_values()) { + if (OpVal.isUndef()) + continue; + auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal); + if (!CstOp) + return false; + if (CstOp->getValueAPF().bitcastToAPInt() != PreferredZero) + return false; + } + return true; + } return false; }; Index: llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -207,22 +207,22 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { ; THRU-LABEL: 'fmaximum' ; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; THRU-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'fmaximum' ; LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; LATE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmaximum' ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'fmaximum' ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call float @llvm.maximum.f32(float %a, float %b) Index: llvm/test/CodeGen/X86/fminimum-fmaximum.ll =================================================================== --- llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -100,59 +100,17 @@ define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { ; SSE2-LABEL: test_fmaximum_scalarize: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; SSE2-NEXT: maxss %xmm2, %xmm3 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE2-NEXT: maxss %xmm2, %xmm4 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; 
SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: maxss %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: maxps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fmaximum_scalarize: ; AVX: # %bb.0: -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] -; AVX-NEXT: vmaxss %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: test_fmaximum_scalarize: ; X86: # %bb.0: -; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm2 -; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X86-NEXT: vmaxss %xmm3, %xmm4, %xmm3 -; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; X86-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X86-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] -; X86-NEXT: vmaxss %xmm3, %xmm4, %xmm3 -; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x 
float> %y) ret <4 x float> %r @@ -631,31 +589,17 @@ define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { ; SSE2-LABEL: test_fminimum_scalarize: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: minsd %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: minsd %xmm1, %xmm0 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: minpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimum_scalarize: ; AVX: # %bb.0: -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-LABEL: test_fminimum_scalarize: ; X86: # %bb.0: -; X86-NEXT: vminsd %xmm1, %xmm0, %xmm2 -; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; X86-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y) ret <2 x double> %r