
Commit b799a62

Committed Jun 14, 2016
[X86] Reduce the width of multiplication when its operands are extended from i8 or i16
For <N x i32> type mul, pmuludq will be used for targets without SSE41, which often introduces many extra pack and unpack instructions in the vectorized loop body because pmuludq generates <N/2 x i64> type values. However, when the operands of an <N x i32> mul are extended from smaller types like i8 and i16, the mul may be shrunk to use pmullw + pmulhw/pmulhuw instead of pmuludq, which generates better code. For targets with SSE41, pmulld is supported, so no shrinking is needed.

Differential Revision: http://reviews.llvm.org/D20931

llvm-svn: 272694
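
For illustration only (not part of the commit): a scalar loop of roughly the following shape, with hypothetical names, is the kind of code whose vectorized body produces the <N x i32> mul with zero-extended i8 operands described above. Each product of two zero-extended bytes fits in 16 bits, so on SSE2-only targets it can now be lowered with pmullw plus a zero extension instead of pmuludq.

#include <cstddef>
#include <cstdint>

// Hypothetical example loop: after vectorization this becomes
//   %op1 = zext <N x i8> ... to <N x i32>
//   %op2 = zext <N x i8> ... to <N x i32>
//   %rst = mul <N x i32> %op1, %op2
// which the new combine lowers with pmullw (+ zext) instead of pmuludq.
void mul_bytes(const uint8_t *a, const uint8_t *b, int32_t *c, size_t n) {
  for (size_t i = 0; i < n; ++i)
    c[i] = int32_t(a[i]) * int32_t(b[i]);
}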
1 parent 07c229c commit b799a62

File tree

3 files changed: 1074 additions & 3 deletions

 

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 2 additions & 0 deletions
@@ -670,6 +670,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 208 additions & 3 deletions
@@ -26962,18 +26962,223 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Different mul shrinking modes.
+enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+  EVT VT = N->getOperand(0).getValueType();
+  if (VT.getScalarSizeInBits() != 32)
+    return false;
+
+  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+  unsigned SignBits[2] = {1, 1};
+  bool IsPositive[2] = {false, false};
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue Opd = N->getOperand(i);
+
+    // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
+    // compute signbits for it separately.
+    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
+      // For anyextend, it is safe to assume an appropriate number of leading
+      // sign/zero bits.
+      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+        SignBits[i] = 25;
+      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
+               MVT::i16)
+        SignBits[i] = 17;
+      else
+        return false;
+      IsPositive[i] = true;
+    } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
+      // All the operands of BUILD_VECTOR need to be int constant.
+      // Find the smallest value range which all the operands belong to.
+      SignBits[i] = 32;
+      IsPositive[i] = true;
+      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
+        if (SubOp.isUndef())
+          continue;
+        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
+        if (!CN)
+          return false;
+        APInt IntVal = CN->getAPIntValue();
+        if (IntVal.isNegative())
+          IsPositive[i] = false;
+        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
+      }
+    } else {
+      SignBits[i] = DAG.ComputeNumSignBits(Opd);
+      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
+        IsPositive[i] = true;
+    }
+  }
+
+  bool AllPositive = IsPositive[0] && IsPositive[1];
+  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
+  // When ranges are from -128 ~ 127, use MULS8 mode.
+  if (MinSignBits >= 25)
+    Mode = MULS8;
+  // When ranges are from 0 ~ 255, use MULU8 mode.
+  else if (AllPositive && MinSignBits >= 24)
+    Mode = MULU8;
+  // When ranges are from -32768 ~ 32767, use MULS16 mode.
+  else if (MinSignBits >= 17)
+    Mode = MULS16;
+  // When ranges are from 0 ~ 65535, use MULU16 mode.
+  else if (AllPositive && MinSignBits >= 16)
+    Mode = MULU16;
+  else
+    return false;
+  return true;
+}
+
+/// When the operands of vector mul are extended from smaller size values,
+/// like i8 and i16, the type of mul may be shrinked to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+///     %2 = sext/zext <N x i8> %1 to <N x i32>
+///     %4 = sext/zext <N x i8> %3 to <N x i32>
+//   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+///     %2 = zext/sext <N x i16> %1 to <N x i32>
+///     %4 = zext/sext <N x i16> %3 to <N x i32>
+///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  // pmulld is supported since SSE41. It is better to use pmulld
+  // instead of pmullw+pmulhw.
+  if (Subtarget.hasSSE41())
+    return SDValue();
+
+  ShrinkMode Mode;
+  if (!canReduceVMulWidth(N, DAG, Mode))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getOperand(0).getValueType();
+  unsigned RegSize = 128;
+  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+  EVT ReducedVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+  // Shrink the operands of mul.
+  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+  if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+    // lower part is needed.
+    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+    if (Mode == MULU8 || Mode == MULS8) {
+      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+                         DL, VT, MulLo);
+    } else {
+      MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+      // the higher part is also needed.
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  ReducedVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result.
+      // Generate shuffle functioning as punpcklwd.
+      SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+      }
+      SDValue ResLo =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, &ShuffleMask[0]);
+      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+      // Generate shuffle functioning as punpckhwd.
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+      }
+      SDValue ResHi =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, &ShuffleMask[0]);
+      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+    }
+  } else {
+    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+    // to legalize the mul explicitly because implicit legalization for type
+    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
+    // instructions which will not exist when we explicitly legalize it by
+    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
+    // <4 x i16> undef).
+    //
+    // Legalize the operands of mul.
+    SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
+                                 DAG.getUNDEF(ReducedVT));
+    Ops[0] = NewN0;
+    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+    Ops[0] = NewN1;
+    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+    if (Mode == MULU8 || Mode == MULS8) {
+      // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+      // part is needed.
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+      // convert the type of mul result to VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
+                                DL, ResVT, Mul);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    } else {
+      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
+      // MULU16/MULS16, both parts are needed.
+      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  OpsVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result. Make sure the type of mul result is VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+  }
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
-                          TargetLowering::DAGCombinerInfo &DCI) {
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalize() && VT.isVector())
+    return reduceVMULWidth(N, DAG, Subtarget);
+
   // An imul is usually smaller than the alternative sequence.
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
 
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT != MVT::i64 && VT != MVT::i32)
     return SDValue();
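
Aside (editorial note, not part of the diff): the 25/24/17/16 thresholds in canReduceVMulWidth above encode how many redundant sign bits an i32 lane must have for its value to round-trip through i8 or i16. A minimal scalar sketch of that arithmetic, with hypothetical helper names:

#include <cstdint>

// An i32 lane with S redundant sign bits carries only 32 - S + 1 significant
// bits, so:
//   S >= 25                  -> value in [-128, 127]     (MULS8)
//   S >= 24 and nonnegative  -> value in [0, 255]        (MULU8)
//   S >= 17                  -> value in [-32768, 32767] (MULS16)
//   S >= 16 and nonnegative  -> value in [0, 65535]      (MULU16)
static bool fitsInSigned16(unsigned SignBits) { return SignBits >= 17; }
static bool fitsInUnsigned16(unsigned SignBits, bool NonNegative) {
  return NonNegative && SignBits >= 16;
}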

@@ -30268,7 +30473,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ADD: return combineAdd(N, DAG, Subtarget);
   case ISD::SUB: return combineSub(N, DAG, Subtarget);
   case X86ISD::ADC: return combineADC(N, DAG, DCI);
-  case ISD::MUL: return combineMul(N, DAG, DCI);
+  case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
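
To see why interleaving the pmullw and pmulhw/pmulhuw halves with punpcklwd/punpckhwd (and bitcasting to <N x i32>) rebuilds the full 32-bit products, here is a small scalar model of one lane; this is an illustration under the MULU16 assumptions, not code from the patch:

#include <cassert>
#include <cstdint>

// One 32-bit lane of the MULU16 path: pmullw produces the low 16 bits of the
// 16x16 product and pmulhuw the high 16 bits; punpcklwd places the low word
// in bits 0..15 and the high word in bits 16..31 of the i32 lane (little
// endian), which is exactly the widened product.
static uint32_t repackLane(uint16_t a, uint16_t b) {
  uint16_t lo = static_cast<uint16_t>(uint32_t(a) * b);         // pmullw
  uint16_t hi = static_cast<uint16_t>((uint32_t(a) * b) >> 16); // pmulhuw
  return uint32_t(lo) | (uint32_t(hi) << 16);                   // punpcklwd + bitcast
}

int main() {
  // Spot-check: the repacked lane equals the widened product.
  assert(repackLane(40000, 50000) == 40000u * 50000u);
  assert(repackLane(0xFFFF, 0xFFFF) == 0xFFFFu * 0xFFFFu);
  return 0;
}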

llvm/test/CodeGen/X86/shrink_vmul.ll

Lines changed: 864 additions & 0 deletions
@@ -0,0 +1,864 @@
; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm3
; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; CHECK-NEXT: movdqa %xmm1, %xmm4
; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm3, %xmm4
; CHECK-NEXT: movdqa %xmm4, %xmm3
; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhuw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhuw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhuw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pmulhuw %xmm0, %xmm4
; CHECK-NEXT: pmullw %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT: movdqa %xmm3, %xmm4
; CHECK-NEXT: pmulhuw %xmm1, %xmm4
; CHECK-NEXT: pmullw %xmm1, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm1
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext_zext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext_zext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmuludq %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm3
; CHECK-NEXT: psrlq $32, %xmm3
; CHECK-NEXT: pmuludq %xmm1, %xmm3
; CHECK-NEXT: psllq $32, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: psrlq $32, %xmm1
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: psllq $32, %xmm1
; CHECK-NEXT: paddq %xmm3, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = sext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = sext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16_sext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pmulhw %xmm0, %xmm4
; CHECK-NEXT: pmullw %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT: movdqa %xmm3, %xmm4
; CHECK-NEXT: pmulhw %xmm1, %xmm4
; CHECK-NEXT: pmullw %xmm1, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst1:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst2:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst3:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst4:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst5:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst6:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst1:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhuw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst2:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst3:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
; CHECK-NEXT: movd %rcx, %xmm1
; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmuludq %xmm1, %xmm2
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: pmuludq %xmm1, %xmm0
; CHECK-NEXT: psllq $32, %xmm0
; CHECK-NEXT: paddq %xmm2, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst4:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: movd %rcx, %xmm1
; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmuludq %xmm1, %xmm2
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: pmuludq %xmm1, %xmm0
; CHECK-NEXT: psllq $32, %xmm0
; CHECK-NEXT: paddq %xmm2, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
