Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -1187,6 +1187,7 @@
       setOperationAction(ISD::FNEG, VT, Custom);
       setOperationAction(ISD::FABS, VT, Custom);
       setOperationAction(ISD::FMA, VT, Legal);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
     }
 
     setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
@@ -15043,7 +15044,7 @@
   bool IsF128 = (VT == MVT::f128);
   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
-          VT == MVT::v8f32) &&
+          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
          "Unexpected type in LowerFCOPYSIGN");
 
   MVT EltVT = VT.getScalarType();
Index: llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
===================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
+++ llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
@@ -417,7 +417,7 @@
   ; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
-  ; AVX512: cost of 77 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+  ; AVX512: cost of 2 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
 
   ; SSE2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
@@ -442,7 +442,7 @@
   ; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
-  ; AVX512: cost of 37 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+  ; AVX512: cost of 2 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
 
   ret i32 undef
Index: llvm/trunk/test/CodeGen/X86/vec-copysign-avx512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec-copysign-avx512.ll
+++ llvm/trunk/test/CodeGen/X86/vec-copysign-avx512.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VLDQ
+
+define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind {
+; AVX512VL-LABEL: v4f32:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512VL-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512VL-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: v4f32:
+; AVX512VLDQ:       ## BB#0:
+; AVX512VLDQ-NEXT:    vandps {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vandps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+  %tmp = tail call <4 x float> @llvm.copysign.v4f32( <4 x float> %a, <4 x float> %b )
+  ret <4 x float> %tmp
+}
+
+define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
+; AVX512VL-LABEL: v8f32:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; AVX512VL-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; AVX512VL-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: v8f32:
+; AVX512VLDQ:       ## BB#0:
+; AVX512VLDQ-NEXT:    vandps {{.*}}(%rip){1to8}, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vandps {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %tmp = tail call <8 x float> @llvm.copysign.v8f32( <8 x float> %a, <8 x float> %b )
+  ret <8 x float> %tmp
+}
+
+define <16 x float> @v16f32(<16 x float> %a, <16 x float> %b) nounwind {
+; AVX512VL-LABEL: v16f32:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: v16f32:
+; AVX512VLDQ:       ## BB#0:
+; AVX512VLDQ-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512VLDQ-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512VLDQ-NEXT:    vorps %zmm1, %zmm0, %zmm0
+; AVX512VLDQ-NEXT:    retq
+  %tmp = tail call <16 x float> @llvm.copysign.v16f32( <16 x float> %a, <16 x float> %b )
+  ret <16 x float> %tmp
+}
+
+define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind {
+; CHECK-LABEL: v2f64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %tmp = tail call <2 x double> @llvm.copysign.v2f64( <2 x double> %a, <2 x double> %b )
+  ret <2 x double> %tmp
+}
+
+define <4 x double> @v4f64(<4 x double> %a, <4 x double> %b) nounwind {
+; AVX512VL-LABEL: v4f64:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX512VL-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX512VL-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: v4f64:
+; AVX512VLDQ:       ## BB#0:
+; AVX512VLDQ-NEXT:    vandpd {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vandpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vorpd %ymm1, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    retq
+  %tmp = tail call <4 x double> @llvm.copysign.v4f64( <4 x double> %a, <4 x double> %b )
+  ret <4 x double> %tmp
+}
+
+define <8 x double> @v8f64(<8 x double> %a, <8 x double> %b) nounwind {
+; AVX512VL-LABEL: v8f64:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: v8f64:
+; AVX512VLDQ:       ## BB#0:
+; AVX512VLDQ-NEXT:    vandpd {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512VLDQ-NEXT:    vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512VLDQ-NEXT:    vorpd %zmm1, %zmm0, %zmm0
+; AVX512VLDQ-NEXT:    retq
  %tmp = tail call <8 x double> @llvm.copysign.v8f64( <8 x double> %a, <8 x double> %b )
+  ret <8 x double> %tmp
+}
+
+declare <4 x float> @llvm.copysign.v4f32(<4 x float> %Mag, <4 x float> %Sgn)
+declare <8 x float> @llvm.copysign.v8f32(<8 x float> %Mag, <8 x float> %Sgn)
+declare <16 x float> @llvm.copysign.v16f32(<16 x float> %Mag, <16 x float> %Sgn)
+declare <2 x double> @llvm.copysign.v2f64(<2 x double> %Mag, <2 x double> %Sgn)
+declare <4 x double> @llvm.copysign.v4f64(<4 x double> %Mag, <4 x double> %Sgn)
+declare <8 x double> @llvm.copysign.v8f64(<8 x double> %Mag, <8 x double> %Sgn)
+
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/fcopysign.ll
@@ -100,16 +100,23 @@
 ; SSE-NEXT:    store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX-LABEL: @fcopysign_8f64(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
-; AVX-NEXT:    store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; AVX-NEXT:    store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX-NEXT:    ret void
+; AVX256-LABEL: @fcopysign_8f64(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
+; AVX256-NEXT:    store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; AVX256-NEXT:    store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @fcopysign_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.copysign.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
+; AVX512-NEXT:    store <8 x double> [[TMP3]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
+; AVX512-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
@@ -247,16 +254,23 @@
 ; SSE-NEXT:    store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX-LABEL: @fcopysign_16f32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
-; AVX-NEXT:    store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX-NEXT:    store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX-NEXT:    ret void
+; AVX256-LABEL: @fcopysign_16f32(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
+; AVX256-NEXT:    store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX256-NEXT:    store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @fcopysign_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.copysign.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
+; AVX512-NEXT:    store <16 x float> [[TMP3]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4