Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -19854,6 +19854,24 @@ assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); + // AVX512DQ - widen to a 512-bit vector so the legal v8i64 multiply is used. + if (Subtarget.hasDQI()) { + assert(!Subtarget.hasVLX() && "AVX512DQVL vXi64 multiply is legal"); + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && + "AVX512DQ v8i64 multiply is legal"); + + MVT NewVT = MVT::getVectorVT(MVT::i64, 512 / VT.getScalarSizeInBits()); + SDValue A512 = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), A, + DAG.getIntPtrConstant(0, dl)); + SDValue B512 = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), B, + DAG.getIntPtrConstant(0, dl)); + SDValue MulNode = DAG.getNode(ISD::MUL, dl, NewVT, A512, B512); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MulNode, + DAG.getIntPtrConstant(0, dl)); + } + // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -204,6 +204,19 @@ return LT.first * Entry->Cost; } + static const CostTblEntry AVX512DQCostTable[] = { + { ISD::MUL, MVT::v2i64, 1 }, + { ISD::MUL, MVT::v4i64, 1 }, + { ISD::MUL, MVT::v8i64, 1 } + }; + + // Look for AVX512DQ lowering tricks for custom cases. + if (ST->hasDQI()) { + if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + static const CostTblEntry AVX512BWCostTable[] = { // Vectorizing division is a bad idea. See the SSE2 table for more comments. 
{ ISD::SDIV, MVT::v64i8, 64*20 }, Index: test/Analysis/CostModel/X86/arith.ll =================================================================== --- test/Analysis/CostModel/X86/arith.ll +++ test/Analysis/CostModel/X86/arith.ll @@ -4,6 +4,7 @@ ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -429,19 +430,25 @@ ; SSE42: cost of 9 {{.*}} %A = mul ; AVX: cost of 9 {{.*}} %A = mul ; AVX2: cost of 9 {{.*}} %A = mul - ; AVX512: cost of 9 {{.*}} %A = mul + ; AVX512F: cost of 9 {{.*}} %A = mul + ; AVX512BW: cost of 9 {{.*}} %A = mul + ; AVX512DQ: cost of 1 {{.*}} %A = mul %A = mul <2 x i64> undef, undef ; SSSE3: cost of 18 {{.*}} %B = mul ; SSE42: cost of 18 {{.*}} %B = mul ; AVX: cost of 18 {{.*}} %B = mul ; AVX2: cost of 9 {{.*}} %B = mul - ; AVX512: cost of 9 {{.*}} %B = mul + ; AVX512F: cost of 9 {{.*}} %B = mul + ; AVX512BW: cost of 9 {{.*}} %B = mul + ; AVX512DQ: cost of 1 {{.*}} %B = mul %B = mul <4 x i64> undef, undef ; SSSE3: cost of 36 {{.*}} %C = mul ; SSE42: cost of 36 {{.*}} %C = mul ; AVX: cost of 36 {{.*}} %C = mul ; AVX2: cost of 18 {{.*}} %C = mul - ; AVX512: cost of 2 {{.*}} %C = mul + ; AVX512F: cost of 2 {{.*}} %C = mul + ; 
AVX512BW: cost of 2 {{.*}} %C = mul + ; AVX512DQ: cost of 1 {{.*}} %C = mul %C = mul <8 x i64> undef, undef ; SSSE3: cost of 6 {{.*}} %D = mul @@ -515,7 +522,9 @@ ; SSE42: cost of 9 {{.*}} %A0 = mul ; AVX: cost of 9 {{.*}} %A0 = mul ; AVX2: cost of 9 {{.*}} %A0 = mul - ; AVX512: cost of 9 {{.*}} %A0 = mul + ; AVX512F: cost of 9 {{.*}} %A0 = mul + ; AVX512BW: cost of 9 {{.*}} %A0 = mul + ; AVX512DQ: cost of 1 {{.*}} %A0 = mul %A0 = mul <2 x i32> undef, undef ret void Index: test/CodeGen/X86/avx512-arith.ll =================================================================== --- test/CodeGen/X86/avx512-arith.ll +++ test/CodeGen/X86/avx512-arith.ll @@ -182,15 +182,10 @@ ; ; AVX512DQ-LABEL: imulq256: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; AVX512DQ-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; SKX-LABEL: imulq256: @@ -243,15 +238,10 @@ ; ; AVX512DQ-LABEL: imulq128: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm2 -; AVX512DQ-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX512DQ-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX512DQ-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX512DQ-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpaddq %xmm0, %xmm3, %xmm0 -; AVX512DQ-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: ## kill: 
%XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: retq ; ; SKX-LABEL: imulq128: