Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -802,6 +802,7 @@
     setOperationAction(ISD::ADD, MVT::v8i16, Legal);
     setOperationAction(ISD::ADD, MVT::v4i32, Legal);
     setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
     setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
@@ -1122,7 +1123,7 @@
     setOperationAction(ISD::MUL, MVT::v4i64, Custom);
     setOperationAction(ISD::MUL, MVT::v8i32, Legal);
     setOperationAction(ISD::MUL, MVT::v16i16, Legal);
-    // Don't lower v32i8 because there is no 128-bit byte mul
+    setOperationAction(ISD::MUL, MVT::v32i8, Custom);
 
     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
@@ -1171,7 +1172,7 @@
     setOperationAction(ISD::MUL, MVT::v4i64, Custom);
     setOperationAction(ISD::MUL, MVT::v8i32, Custom);
     setOperationAction(ISD::MUL, MVT::v16i16, Custom);
-    // Don't lower v32i8 because there is no 128-bit byte mul
+    setOperationAction(ISD::MUL, MVT::v32i8, Custom);
   }
 
   // In the customized shift lowering, the legal cases in AVX2 will be
@@ -9878,7 +9879,7 @@
 
   int NumV2Elements = std::count_if(Mask.begin(), Mask.end(),
                                     [NumElts](int M) { return M >= NumElts; });
-  
+
   if (NumV2Elements == 1 && Mask[0] >= NumElts)
     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
             DL, VT, V1, V2, Mask, Subtarget, DAG))
@@ -10630,7 +10631,7 @@
       return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
     }
   }
-  
+
   // Get the desired 128-bit vector chunk.
   SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
 
@@ -15892,6 +15893,32 @@
   SDValue A = Op.getOperand(0);
   SDValue B = Op.getOperand(1);
 
+  // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
+  // pairs, multiply and truncate.
+  if (VT == MVT::v16i8 || VT == MVT::v32i8) {
+    MVT ExVT = (VT == MVT::v16i8 ? MVT::v8i16 : MVT::v16i16);
+    // Extract the lo parts, sign extend to i16 and multiply.
+    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, A, A);
+    SDValue BLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, B, B);
+    ALo = DAG.getNode(ISD::BITCAST, dl, ExVT, ALo);
+    BLo = DAG.getNode(ISD::BITCAST, dl, ExVT, BLo);
+    ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, ExVT));
+    BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, ExVT));
+    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+    // Extract the hi parts, sign extend to i16 and multiply.
+    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, A, A);
+    SDValue BHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, B, B);
+    AHi = DAG.getNode(ISD::BITCAST, dl, ExVT, AHi);
+    BHi = DAG.getNode(ISD::BITCAST, dl, ExVT, BHi);
+    AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, ExVT));
+    BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, ExVT));
+    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+    // Mask the lower 8 bits of the lo/hi results and pack.
+    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, ExVT));
+    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, ExVT));
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+  }
+
   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
   if (VT == MVT::v4i32) {
     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
Index: test/CodeGen/X86/avx2-arith.ll
===================================================================
--- test/CodeGen/X86/avx2-arith.ll
+++ test/CodeGen/X86/avx2-arith.ll
@@ -60,6 +60,26 @@
   ret <16 x i16> %x
 }
 
+; CHECK: mul-v32i8
+; CHECK: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-NEXT: vpsraw $8, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-NEXT: vpsraw $8, %ymm3, %ymm3
+; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-NEXT: vpsraw $8, %ymm1, %ymm1
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-NEXT: vpsraw $8, %ymm0, %ymm0
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpand %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+  %x = mul <32 x i8> %i, %j
+  ret <32 x i8> %x
+}
+
 ; CHECK: mul-v4i64
 ; CHECK: vpmuludq %ymm
 ; CHECK-NEXT: vpsrlq $32, %ymm
Index: test/CodeGen/X86/pmul.ll
===================================================================
--- test/CodeGen/X86/pmul.ll
+++ test/CodeGen/X86/pmul.ll
@@ -1,6 +1,42 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
 
+define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
+; ALL-LABEL: mul8c:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; ALL-NEXT: movdqa %xmm1, %xmm2
+; ALL-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm2
+; ALL-NEXT: movdqa %xmm0, %xmm3
+; ALL-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm3
+; ALL-NEXT: pmullw %xmm2, %xmm3
+; ALL-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; ALL-NEXT: pand %xmm2, %xmm3
+; ALL-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm1
+; ALL-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm0
+; ALL-NEXT: pmullw %xmm1, %xmm0
+; ALL-NEXT: pand %xmm2, %xmm0
+; ALL-NEXT: packuswb %xmm3, %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
+  ret <16 x i8> %A
+}
+
+define <8 x i16> @mul16c(<8 x i16> %i) nounwind {
+; ALL-LABEL: mul16c:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: pmullw {{.*}}(%rip), %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
+  ret <8 x i16> %A
+}
+
 define <4 x i32> @a(<4 x i32> %i) nounwind {
 ; SSE2-LABEL: a:
 ; SSE2: # BB#0: # %entry
@@ -42,6 +78,41 @@
   ret <2 x i64> %A
 }
 
+define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
+; ALL-LABEL: mul8:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: movdqa %xmm1, %xmm2
+; ALL-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm2
+; ALL-NEXT: movdqa %xmm0, %xmm3
+; ALL-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; ALL-NEXT: psraw $8, %xmm3
+; ALL-NEXT: pmullw %xmm2, %xmm3
+; ALL-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; ALL-NEXT: pand %xmm2, %xmm3
+; ALL-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm1
+; ALL-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; ALL-NEXT: psraw $8, %xmm0
+; ALL-NEXT: pmullw %xmm1, %xmm0
+; ALL-NEXT: pand %xmm2, %xmm0
+; ALL-NEXT: packuswb %xmm3, %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <16 x i8> %i, %j
+  ret <16 x i8> %A
+}
+
+define <8 x i16> @mul16(<8 x i16> %i, <8 x i16> %j) nounwind {
+; ALL-LABEL: mul16:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: pmullw %xmm1, %xmm0
+; ALL-NEXT: retq
+entry:
+  %A = mul <8 x i16> %i, %j
+  ret <8 x i16> %A
+}
+
 define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
 ; SSE2-LABEL: c:
 ; SSE2: # BB#0: # %entry
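
For reference, the lowering exercised by the tests above reduces each byte multiply to: sign-extend the bytes to 16 bits (punpck + psraw $8), multiply as i16 (pmullw), mask to the low byte (pand), and pack back to bytes (packuswb). The sketch below is only an illustration of why that sequence matches LLVM's wrapping i8 multiply; it is not part of the patch, and the helper names (MulI8, MulViaI16) are made up for the example.

#include <cassert>
#include <cstdint>

// Reference semantics of 'mul <16 x i8>': each lane wraps modulo 256.
static uint8_t MulI8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>(A * B);
}

// The widened route used by the lowering: sign extend each byte to 16 bits,
// multiply as i16, and keep only the low 8 bits of the product.
static uint8_t MulViaI16(uint8_t A, uint8_t B) {
  int16_t WA = static_cast<int8_t>(A); // sign extension; the low 8 bits of the
  int16_t WB = static_cast<int8_t>(B); // product do not depend on this choice
  return static_cast<uint8_t>((WA * WB) & 0xFF);
}

int main() {
  // Exhaustive check over all byte pairs: both routes agree on every lane.
  for (int A = 0; A < 256; ++A)
    for (int B = 0; B < 256; ++B)
      assert(MulI8(static_cast<uint8_t>(A), static_cast<uint8_t>(B)) ==
             MulViaI16(static_cast<uint8_t>(A), static_cast<uint8_t>(B)));
  return 0;
}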