Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26433,10 +26433,194 @@
   return SDValue();
 }
 
+/// When the operands of a vector mul are extended from smaller size values,
+/// like i8 and i16, the type of the mul may be shrunk to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+///     %2 = zext/sext <N x i8> %1 to <N x i32>
+///     %4 = zext/sext <N x i8> %3 to <N x i32>
+///     %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+///     %2 = zext/sext <N x i16> %1 to <N x i32>
+///     %4 = zext/sext <N x i16> %3 to <N x i32>
+///     %5 = mul <N x i32> %2, %4
+///
+/// For pattern1, the mul <N x i32> can be shrunk to mul <N x i16>.
+/// For pattern2, the mul <N x i32> can be shrunk to mul <N x i16>
+/// plus mulhs/mulhu <N x i16>.
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  // pmulld is supported since SSE41. It is better to use pmulld
+  // instead of pmullw+pmulhw.
+  if (Subtarget.hasSSE41())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+
+  if (VT.getScalarSizeInBits() != 32)
+    return SDValue();
+
+  enum {
+    DONTCARE,
+    SIZEI8,
+    SIZEI16,
+  };
+  int N0Size = DONTCARE, N1Size = DONTCARE;
+  bool IsSigned = false;
+  // Find out the smaller size of N0 before it is sign/zero-extended.
+  if (N0.getOpcode() == ISD::ZERO_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND) {
+    IsSigned = N0.getOpcode() == ISD::SIGN_EXTEND;
+    if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+      N0Size = SIZEI8;
+    else if (N0.getOperand(0).getValueType().getVectorElementType() ==
+             MVT::i16)
+      N0Size = SIZEI16;
+  } else {
+    return SDValue();
+  }
+
+  // Find out the smaller size of N1 before it is sign/zero-extended.
+  if (N1.getOpcode() == ISD::ZERO_EXTEND ||
+      N1.getOpcode() == ISD::SIGN_EXTEND) {
+    if (N1.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+      N1Size = SIZEI8;
+    else if (N1.getOperand(0).getValueType().getVectorElementType() ==
+             MVT::i16)
+      N1Size = SIZEI16;
+  }
+
+  // Find out the size of N1 if it is a constant splat.
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  auto *BV = dyn_cast<BuildVectorSDNode>(N1);
+  if (BV &&
+      BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    // Check whether the splat constant fits in i8 or i16.
+    if (SplatBitSize == 8)
+      N1Size = SIZEI8;
+    else if (SplatBitSize == 16)
+      N1Size = SIZEI16;
+  }
+
+  if (N0Size == DONTCARE || N1Size == DONTCARE)
+    return SDValue();
+
+  // The current mul can be shrunk.
+  int OpsSize = std::max(N0Size, N1Size);
+  unsigned RegSize = 128;
+  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+  EVT ReducedVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+  // Shrink the operands of the mul.
+  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+  // Handle the case when a split is needed.
+  if (VT.getVectorNumElements() > OpsVT.getVectorNumElements()) {
+    // Split the nodes.
+    unsigned SplitNum =
+        VT.getVectorNumElements() / OpsVT.getVectorNumElements();
+    SmallVector<SDValue, 4> MulRes(SplitNum);
+
+    MVT ExtVT = MVT::getVectorVT(MVT::i32, OpsVT.getVectorNumElements());
+    for (unsigned i = 0; i < SplitNum; i++) {
+      unsigned ExtractPos = i * OpsVT.getVectorNumElements();
+      SDValue SubN0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OpsVT, NewN0,
+                                  DAG.getIntPtrConstant(ExtractPos, DL));
+      SDValue SubN1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OpsVT, NewN1,
+                                  DAG.getIntPtrConstant(ExtractPos, DL));
+      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, SubN0, SubN1);
+      if (OpsSize == SIZEI8) {
+        MulRes[i] = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                                DL, ExtVT, MulLo);
+      } else {
+        MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+        SDValue MulHi = DAG.getNode(IsSigned ? ISD::MULHS : ISD::MULHU, DL,
+                                    OpsVT, SubN0, SubN1);
+        SDValue ResLo = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+        ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+        SDValue ResHi = DAG.getNode(X86ISD::UNPCKH, DL, OpsVT, MulLo, MulHi);
+        ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+        MulRes[i] = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtVT, ResLo, ResHi);
+      }
+    }
+    // Concat the split MulRes results.
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, MulRes);
+  }
+
+  // Now handle the cases when no split is needed, i.e.
+  // VT.getVectorNumElements() <= OpsVT.getVectorNumElements().
+  //
+  // Legalize the operands of the mul.
+  SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
+                               DAG.getUNDEF(ReducedVT));
+  Ops[0] = NewN0;
+  NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+  Ops[0] = NewN1;
+  NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+  if (OpsSize == SIZEI8) {
+    // For "mul <N x i32> N0, N1" where N0 and N1 are extended from <N x i8>,
+    // shrink the mul type to "mul <N x i16> N0, N1".
+    //
+    // Generate the new mul node.
+    SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+    // Convert the type of the mul result to VT.
+    if (VT.getVectorNumElements() < OpsVT.getVectorNumElements()) {
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
+                                         : ISD::ZERO_EXTEND_VECTOR_INREG,
+                                DL, ResVT, Mul);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    } else {
+      // The case when
+      // VT.getVectorNumElements() == OpsVT.getVectorNumElements().
+      return DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+                         VT, Mul);
+    }
+  } else {
+    // OpsSize == SIZEI16
+    // For "mul <N x i32> N0, N1" where N0 and N1 are extended from
+    // <N x i16>, try to shrink "mul <N x i32> N0, N1" to
+    // "mul <N x i16> N0, N1" plus "mulhs/mulhu <N x i16> N0, N1".
+    //
+    // Generate the new mul and mulhs/mulhu nodes.
+    SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+    SDValue MulHi = DAG.getNode(IsSigned ? ISD::MULHS : ISD::MULHU, DL, OpsVT,
+                                NewN0, NewN1);
+
+    // Repack the results of mul+mulhs/mulhu and create a VT type node.
+    MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+    if (VT.getVectorNumElements() < OpsVT.getVectorNumElements()) {
+      SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    } else {
+      // The case when
+      // VT.getVectorNumElements() == OpsVT.getVectorNumElements().
+      SDValue ResLo = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+      SDValue ResHi = DAG.getNode(X86ISD::UNPCKH, DL, OpsVT, MulLo, MulHi);
+      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+    }
+  }
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
-                          TargetLowering::DAGCombinerInfo &DCI) {
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalize() && VT.isVector())
+    return reduceVMULWidth(N, DAG, Subtarget);
+
   // An imul is usually smaller than the alternative sequence.
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
@@ -26444,7 +26628,6 @@
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT != MVT::i64 && VT != MVT::i32)
     return SDValue();
 
@@ -29590,7 +29773,8 @@
   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
-  case ISD::MUL:            return combineMul(N, DAG, DCI);
+  case ISD::MUL:
+    return combineMul(N, DAG, DCI, Subtarget);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
Index: test/CodeGen/X86/shrink_vmul.ll
===================================================================
--- test/CodeGen/X86/shrink_vmul.ll
+++ test/CodeGen/X86/shrink_vmul.ll
@@ -0,0 +1,418 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
+
+@c = external global i32*, align 8
+
+define void @mul_2xi8(i8* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_2xi8:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movzwl (%rdi,%rcx), %edx
+; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    movzwl 1(%rdi,%rcx), %edx
+; CHECK-NEXT:    movd %edx, %xmm2
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; CHECK-NEXT:    pmullw %xmm1, %xmm2
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT:    movq %xmm2, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $2, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp6 = getelementptr inbounds i8, i8* %d, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp9 = or i64 %index, 1
+  %tmp10 = getelementptr inbounds i8, i8* %d, i64 %tmp9
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  %index.next = add i64 %index, 2
+  %tmp16 = icmp eq i64 %index.next, 1024
+  br i1 %tmp16, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
+
+define void @mul_4xi8(i8* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_4xi8:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; CHECK-NEXT:    pmullw %xmm1, %xmm2
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT:    movdqu %xmm2, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $4, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB1_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp6 = getelementptr inbounds i8, i8* %d, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
+  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
+  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
+  %tmp9 = or i64 %index, 1
+  %tmp10 = getelementptr inbounds i8, i8* %d, i64 %tmp9
+  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
+  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
+  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
+  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+  %index.next = add i64 %index, 4
+  %tmp16 = icmp eq i64 %index.next, 1024
+  br i1 %tmp16, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
+
+define void @mul_8xi8(i8* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_8xi8:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB2_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; CHECK-NEXT:    pmullw %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm1, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $8, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB2_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp6 = getelementptr inbounds i8, i8* %d, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
+  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
+  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
+  %tmp9 = or i64 %index, 1
+  %tmp10 = getelementptr inbounds i8, i8* %d, i64 %tmp9
+  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
+  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
+  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
+  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+  %index.next = add i64 %index, 8
+  %tmp16 = icmp eq i64 %index.next, 1024
+  br i1 %tmp16, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
+
+define void @mul_16xi8(i8* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_16xi8:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB3_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqu (%rdi,%rcx), %xmm1
+; CHECK-NEXT:    movdqu 1(%rdi,%rcx), %xmm2
+; CHECK-NEXT:    movdqa %xmm1, %xmm3
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; CHECK-NEXT:    movdqa %xmm2, %xmm4
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; CHECK-NEXT:    pmullw %xmm3, %xmm4
+; CHECK-NEXT:    movdqa %xmm4, %xmm3
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; CHECK-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; CHECK-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; CHECK-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; CHECK-NEXT:    pmullw %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; CHECK-NEXT:    movdqu %xmm2, 48(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm4, 16(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm3, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $16, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB3_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %pre = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp6 = getelementptr inbounds i8, i8* %d, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
+  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
+  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
+  %tmp9 = or i64 %index, 1
+  %tmp10 = getelementptr inbounds i8, i8* %d, i64 %tmp9
+  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
+  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
+  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
+  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+  %index.next = add i64 %index, 16
+  %tmp16 = icmp eq i64 %index.next, 1024
+  br i1 %tmp16, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
+
+define void @mul_2xi16(i16* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_2xi16:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB4_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
+; CHECK-NEXT:    pmullw %xmm0, %xmm1
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT:    movq %xmm1, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $2, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB4_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %tmp = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp4 = getelementptr inbounds i16, i16* %d, i64 %index
+  %tmp5 = bitcast i16* %tmp4 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp5, align 2
+  %tmp6 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp7 = add i64 %index, 2
+  %tmp8 = getelementptr inbounds i16, i16* %d, i64 %tmp7
+  %tmp9 = bitcast i16* %tmp8 to <2 x i16>*
+  %wide.load15 = load <2 x i16>, <2 x i16>* %tmp9, align 2
+  %tmp10 = zext <2 x i16> %wide.load15 to <2 x i32>
+  %tmp11 = mul nuw nsw <2 x i32> %tmp10, %tmp6
+  %tmp12 = getelementptr inbounds i32, i32* %tmp, i64 %index
+  %tmp13 = bitcast i32* %tmp12 to <2 x i32>*
+  store <2 x i32> %tmp11, <2 x i32>* %tmp13, align 4
+  %index.next = add i64 %index, 2
+  %tmp14 = icmp eq i64 %index.next, 1024
+  br i1 %tmp14, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
+
+define void @mul_4xi16(i16* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_4xi16:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB5_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
+; CHECK-NEXT:    pmullw %xmm0, %xmm1
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT:    movdqu %xmm1, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $4, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB5_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %tmp = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp4 = getelementptr inbounds i16, i16* %d, i64 %index
+  %tmp5 = bitcast i16* %tmp4 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %tmp5, align 2
+  %tmp6 = zext <4 x i16> %wide.load to <4 x i32>
+  %tmp7 = add i64 %index, 2
+  %tmp8 = getelementptr inbounds i16, i16* %d, i64 %tmp7
+  %tmp9 = bitcast i16* %tmp8 to <4 x i16>*
+  %wide.load15 = load <4 x i16>, <4 x i16>* %tmp9, align 2
+  %tmp10 = zext <4 x i16> %wide.load15 to <4 x i32>
+  %tmp11 = mul nuw nsw <4 x i32> %tmp10, %tmp6
+  %tmp12 = getelementptr inbounds i32, i32* %tmp, i64 %index
+  %tmp13 = bitcast i32* %tmp12 to <4 x i32>*
+  store <4 x i32> %tmp11, <4 x i32>* %tmp13, align 4
+  %index.next = add i64 %index, 4
+  %tmp14 = icmp eq i64 %index.next, 1024
+  br i1 %tmp14, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
+
+define void @mul_8xi16(i16* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_8xi16:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB6_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqu (%rdi,%rcx,2), %xmm0
+; CHECK-NEXT:    movdqu 4(%rdi,%rcx,2), %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
+; CHECK-NEXT:    pmullw %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm0, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $8, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB6_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %tmp = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp4 = getelementptr inbounds i16, i16* %d, i64 %index
+  %tmp5 = bitcast i16* %tmp4 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %tmp5, align 2
+  %tmp6 = zext <8 x i16> %wide.load to <8 x i32>
+  %tmp7 = add i64 %index, 2
+  %tmp8 = getelementptr inbounds i16, i16* %d, i64 %tmp7
+  %tmp9 = bitcast i16* %tmp8 to <8 x i16>*
+  %wide.load15 = load <8 x i16>, <8 x i16>* %tmp9, align 2
+  %tmp10 = zext <8 x i16> %wide.load15 to <8 x i32>
+  %tmp11 = mul nuw nsw <8 x i32> %tmp10, %tmp6
+  %tmp12 = getelementptr inbounds i32, i32* %tmp, i64 %index
+  %tmp13 = bitcast i32* %tmp12 to <8 x i32>*
+  store <8 x i32> %tmp11, <8 x i32>* %tmp13, align 4
+  %index.next = add i64 %index, 8
+  %tmp14 = icmp eq i64 %index.next, 1024
+  br i1 %tmp14, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
+
+define void @mul_16xi16(i16* nocapture readonly %d, i32 %N) {
+; CHECK-LABEL: mul_16xi16:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB7_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm0
+; CHECK-NEXT:    movdqu (%rdi,%rcx,2), %xmm1
+; CHECK-NEXT:    movdqu 20(%rdi,%rcx,2), %xmm2
+; CHECK-NEXT:    movdqu 4(%rdi,%rcx,2), %xmm3
+; CHECK-NEXT:    movdqa %xmm3, %xmm4
+; CHECK-NEXT:    pmulhuw %xmm1, %xmm4
+; CHECK-NEXT:    pmullw %xmm1, %xmm3
+; CHECK-NEXT:    movdqa %xmm3, %xmm1
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; CHECK-NEXT:    movdqa %xmm2, %xmm4
+; CHECK-NEXT:    pmulhuw %xmm0, %xmm4
+; CHECK-NEXT:    pmullw %xmm0, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-NEXT:    movdqu %xmm2, 48(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm0, 32(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm3, 16(%rax,%rcx,4)
+; CHECK-NEXT:    movdqu %xmm1, (%rax,%rcx,4)
+; CHECK-NEXT:    addq $16, %rcx
+; CHECK-NEXT:    cmpq $1024, %rcx # imm = 0x400
+; CHECK-NEXT:    jne .LBB7_1
+; CHECK-NEXT:  # BB#2: # %middle.block
+; CHECK-NEXT:    retq
+entry:
+  %tmp = load i32*, i32** @c
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %tmp4 = getelementptr inbounds i16, i16* %d, i64 %index
+  %tmp5 = bitcast i16* %tmp4 to <16 x i16>*
+  %wide.load = load <16 x i16>, <16 x i16>* %tmp5, align 2
+  %tmp6 = zext <16 x i16> %wide.load to <16 x i32>
+  %tmp7 = add i64 %index, 2
+  %tmp8 = getelementptr inbounds i16, i16* %d, i64 %tmp7
+  %tmp9 = bitcast i16* %tmp8 to <16 x i16>*
+  %wide.load15 = load <16 x i16>, <16 x i16>* %tmp9, align 2
+  %tmp10 = zext <16 x i16> %wide.load15 to <16 x i32>
+  %tmp11 = mul nuw nsw <16 x i32> %tmp10, %tmp6
+  %tmp12 = getelementptr inbounds i32, i32* %tmp, i64 %index
+  %tmp13 = bitcast i32* %tmp12 to <16 x i32>*
+  store <16 x i32> %tmp11, <16 x i32>* %tmp13, align 4
+  %index.next = add i64 %index, 16
+  %tmp14 = icmp eq i64 %index.next, 1024
+  br i1 %tmp14, label %middle.block, label %vector.body
+
+middle.block:
+  ret void
+}
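
As a rough, standalone illustration (not part of the patch) of "pattern 2" with signed extensions: the tests above only exercise the zero-extended form (pmulhuw), but when both operands are sign-extended from i16, IsSigned is true and the combine is expected to pick ISD::MULHS, so under -mtriple=x86_64-unknown-unknown -mattr=+sse2 a function like the hypothetical one below should lower to pmullw plus pmulhw followed by punpcklwd/punpckhwd. The function name is made up for illustration and is not in the test file.

define <8 x i32> @mul_8xi16_sext(<8 x i16> %a, <8 x i16> %b) {
entry:
  ; Both operands are sign-extended from i16, so reduceVMULWidth should
  ; shrink the <8 x i32> mul to a <8 x i16> mul plus mulhs (pmulhw).
  %ea = sext <8 x i16> %a to <8 x i32>
  %eb = sext <8 x i16> %b to <8 x i32>
  %m = mul nsw <8 x i32> %ea, %eb
  ret <8 x i32> %m
}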