Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -670,6 +670,8 @@
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26433,10 +26433,216 @@
   return SDValue();
 }
 
+/// Different mul shrinking modes.
+enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+  EVT VT = N->getOperand(0).getValueType();
+  if (VT.getScalarSizeInBits() != 32)
+    return false;
+
+  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+  unsigned SignBits[2] = {1, 1};
+  bool IsPositive[2] = {false, false};
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue Opd = N->getOperand(i);
+
+    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
+    // compute signbits for it separately.
+    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
+      // For anyextend, it is safe to assume an appropriate number of leading
+      // sign/zero bits.
+      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+        SignBits[i] = 25;
+      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
+               MVT::i16)
+        SignBits[i] = 17;
+      else
+        return false;
+      IsPositive[i] = true;
+    } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
+      // All the operands of BUILD_VECTOR need to be int constants.
+      // Find the smallest value range which all the operands belong to.
+      SignBits[i] = 32;
+      IsPositive[i] = true;
+      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
+        if (SubOp.isUndef())
+          continue;
+        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
+        if (!CN)
+          return false;
+        APInt IntVal = CN->getAPIntValue();
+        if (IntVal.isNegative())
+          IsPositive[i] = false;
+        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
+      }
+    } else {
+      SignBits[i] = DAG.ComputeNumSignBits(Opd);
+      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
+        IsPositive[i] = true;
+    }
+  }
+
+  bool AllPositive = IsPositive[0] && IsPositive[1];
+  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
+  // When ranges are from -128 ~ 127, use MULS8 mode.
+  if (MinSignBits >= 25)
+    Mode = MULS8;
+  // When ranges are from 0 ~ 255, use MULU8 mode.
+  else if (AllPositive && MinSignBits >= 24)
+    Mode = MULU8;
+  // When ranges are from -32768 ~ 32767, use MULS16 mode.
+  else if (MinSignBits >= 17)
+    Mode = MULS16;
+  // When ranges are from 0 ~ 65535, use MULU16 mode.
+  else if (AllPositive && MinSignBits >= 16)
+    Mode = MULU16;
+  else
+    return false;
+  return true;
+}
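+
+// A quick sanity check on the thresholds above (illustrative numbers, not
+// tied to any particular code path): a 32-bit element sign-extended from i8
+// has bits 7..31 all equal to the sign bit, so ComputeNumSignBits reports
+// 32 - 8 + 1 = 25, and MinSignBits >= 25 exactly captures the i8 range
+// -128..127. A zero-extend from i8 only guarantees the 24 zero bits above
+// bit 7, hence the weaker AllPositive && MinSignBits >= 24 test for MULU8.
+// The i16 thresholds 17 and 16 follow from the same arithmetic.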
+
+/// When the operands of vector mul are extended from smaller size values,
+/// like i8 and i16, the type of mul may be shrunk to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+///     %2 = sext/zext <N x i8> %1 to <N x i32>
+///     %4 = sext/zext <N x i8> %3 to <N x i32>
+///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+///     %2 = zext/sext <N x i16> %1 to <N x i32>
+///     %4 = zext/sext <N x i16> %3 to <N x i32>
+///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  // pmulld is supported since SSE41. It is better to use pmulld
+  // instead of pmullw+pmulhw.
+  if (Subtarget.hasSSE41())
+    return SDValue();
+
+  ShrinkMode Mode;
+  if (!canReduceVMulWidth(N, DAG, Mode))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getOperand(0).getValueType();
+  unsigned RegSize = 128;
+  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+  EVT ReducedVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+  // Shrink the operands of mul.
+  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+  if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+    // lower part is needed.
+    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+    if (Mode == MULU8 || Mode == MULS8) {
+      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+                         DL, VT, MulLo);
+    } else {
+      MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+      // the higher part is also needed.
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  ReducedVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result.
+      // Generate shuffle functioning as punpcklwd.
+      SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+      }
+      SDValue ResLo =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, &ShuffleMask[0]);
+      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
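+      // For instance (purely illustrative), with VT = <8 x i32> the loop
+      // above builds the mask {0, 8, 1, 9, 2, 10, 3, 11}, interleaving the
+      // low four i16 lanes of MulLo and MulHi -- exactly what punpcklwd does.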
+      // Generate shuffle functioning as punpckhwd.
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+      }
+      SDValue ResHi =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, &ShuffleMask[0]);
+      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+    }
+  } else {
+    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+    // to legalize the mul explicitly because implicit legalization for type
+    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
+    // instructions which will not exist when we explicitly legalize it by
+    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
+    // <4 x i16> undef).
+    //
+    // Legalize the operands of mul.
+    SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
+                                 DAG.getUNDEF(ReducedVT));
+    Ops[0] = NewN0;
+    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+    Ops[0] = NewN1;
+    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+    if (Mode == MULU8 || Mode == MULS8) {
+      // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+      // part is needed.
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+      // Convert the type of mul result to VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
+                                DL, ResVT, Mul);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    } else {
+      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
+      // MULU16/MULS16, both parts are needed.
+      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  OpsVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result. Make sure the type of mul result is VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+  }
+}
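+
+// End-to-end, for, say, VT = <8 x i32> with both operands zero-extended from
+// <8 x i16> (MULU16 mode), the nodes built above correspond to:
+//   t0 = pmullw  a, b      ; low 16 bits of each 32-bit product
+//   t1 = pmulhuw a, b      ; high 16 bits of each 32-bit product
+//   lo = punpcklwd t0, t1  ; products 0..3 as i32
+//   hi = punpckhwd t0, t1  ; products 4..7 as i32
+// This is an informal sketch of the resulting DAG, not literal output; see
+// test/CodeGen/X86/shrink_vmul.ll below for the exact code generated.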
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
-                          TargetLowering::DAGCombinerInfo &DCI) {
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalize() && VT.isVector())
+    return reduceVMULWidth(N, DAG, Subtarget);
+
   // An imul is usually smaller than the alternative sequence.
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
@@ -26444,7 +26650,6 @@
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT != MVT::i64 && VT != MVT::i32)
     return SDValue();
@@ -29590,7 +29795,7 @@
   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
-  case ISD::MUL:            return combineMul(N, DAG, DCI);
+  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
Index: test/CodeGen/X86/shrink_vmul.ll
===================================================================
--- test/CodeGen/X86/shrink_vmul.ll
+++ test/CodeGen/X86/shrink_vmul.ll
@@ -0,0 +1,864 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
+
+@c = external global i32*, align 8
+
+; %val1 = load <2 x i8>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
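+
+; An informal reading of the mul_2xi8 asm above: movzwl loads exactly the two
+; i8 elements, punpcklbw against the zero register zero-extends them to i16,
+; pmullw produces the low 16 bits of each product, and the final punpcklwd
+; against zero widens the two i16 products to i32 -- i.e. the MULU8 path.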
+
+; %val1 = load <4 x i8>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i8>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_4xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
+  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
+  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
+  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
+  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
+  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <8 x i8>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i8>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_8xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
+  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
+  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
+  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
+  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
+  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <16 x i8>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i8>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; CHECK-NEXT: movdqa %xmm1, %xmm4
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm3, %xmm4
+; CHECK-NEXT: movdqa %xmm4, %xmm3
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
+  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
+  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
+  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
+  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
+  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <4 x i16>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i16>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_4xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
+  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
+  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
+  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
+  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <8 x i16>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i16>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_8xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
+  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
+  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
+  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
+  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
+; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: pmulhuw %xmm0, %xmm4
+; CHECK-NEXT: pmullw %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-NEXT: movdqa %xmm3, %xmm4
+; CHECK-NEXT: pmulhuw %xmm1, %xmm4
+; CHECK-NEXT: pmullw %xmm1, %xmm3
+; CHECK-NEXT: movdqa %xmm3, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
+  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm1
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
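+
+; An informal reading of the mul_2xi8_sext asm above: punpcklbw duplicating
+; each byte followed by psraw $8 sign-extends i8 to i16, pmullw multiplies,
+; and the punpcklwd + psrad $16 pair sign-extends the two i16 products to
+; i32 -- the MULS8 path of the combine.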
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8_sext_zext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16_sext_zext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: psrlq $32, %xmm3
+; CHECK-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-NEXT: psllq $32, %xmm3
+; CHECK-NEXT: paddq %xmm2, %xmm3
+; CHECK-NEXT: psrlq $32, %xmm1
+; CHECK-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-NEXT: psllq $32, %xmm1
+; CHECK-NEXT: paddq %xmm3, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = sext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = sext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi16_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
+; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: pmulhw %xmm0, %xmm4
+; CHECK-NEXT: pmullw %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-NEXT: movdqa %xmm3, %xmm4
+; CHECK-NEXT: pmulhw %xmm1, %xmm4
+; CHECK-NEXT: pmullw %xmm1, %xmm3
+; CHECK-NEXT: movdqa %xmm3, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
+  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
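+
+; Note on mul_2xi8_varconst3: 256 is just outside the unsigned 8-bit range, so
+; MULU8 is rejected; both operands still fit in signed 16 bits, which is why
+; the checks above show pmullw+pmulhw (MULS16 mode) instead of a plain zext.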
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst5:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst6:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhuw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
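+
+; The two remaining cases use constants just outside the 16-bit ranges (65536
+; for the zext operand, 32768 for the sext operand), so the mul cannot be
+; shrunk and the default pmuludq-based lowering remains.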
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}