; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

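; These tests check that a multiply of <N x i32> operands that were
; zero- or sign-extended from narrower i8/i16 loads is shrunk to 16-bit
; multiplies (pmullw plus pmulh(u)w or an unpack) whenever both operands
; are known to fit in 16 bits, rather than being widened to i32 lanes.
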
@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
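; Both bytes are fetched with a single scalar movzwl, zero-extended to words
; with punpcklbw against a zeroed register, multiplied with pmullw, and the
; 16-bit products are zero-extended to i32 lanes with punpcklwd.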
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
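; Same pattern as mul_2xi8, except the four bytes fit in one movd load and
; the result needs a full unaligned movdqu store.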
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
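; Eight bytes arrive via movq; the low and high halves of the 16-bit products
; are unpacked separately, producing two stores.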
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
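; A full 16-byte load is split into low/high byte halves with
; punpcklbw/punpckhbw, needing two pmullws and four stores.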
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi8:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm3
; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; CHECK-NEXT: movdqa %xmm1, %xmm4
; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT: pmullw %xmm3, %xmm4
; CHECK-NEXT: movdqa %xmm4, %xmm3
; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
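; For zero-extended i16 operands the 32-bit product is assembled from
; pmullw (low half) and pmulhuw (high half) interleaved with punpcklwd.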
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhuw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
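; Same pmullw/pmulhuw pattern as mul_2xi16, with movq loads and a movdqu store.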
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhuw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
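; Full-width loads; both interleaved product halves are stored.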
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhuw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
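; The pmullw/pmulhuw pattern applied to two register pairs, giving four stores.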
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pmulhuw %xmm0, %xmm4
; CHECK-NEXT: pmullw %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT: movdqa %xmm3, %xmm4
; CHECK-NEXT: pmulhuw %xmm1, %xmm4
; CHECK-NEXT: pmullw %xmm1, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
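; The bytes are sign-extended with punpcklbw + psraw $8, and the 16-bit
; products are sign-extended to i32 with punpcklwd + psrad $16.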
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm1
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
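; A sign-extended i8 (-128..127) and a zero-extended i8 (0..255) both fit in
; a signed 16-bit lane, so pmullw/pmulhw still yield the exact 32-bit product.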
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext_zext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
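; As mul_2xi16, but the high half of the signed product comes from pmulhw.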
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmulhw %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm0, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
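; A zero-extended i16 need not fit in a signed 16-bit lane, so no 16-bit
; multiply applies; the operands are widened and multiplied via a
; pmuludq/shift/paddq sequence.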
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext_zext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pmuludq %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm3
; CHECK-NEXT: psrlq $32, %xmm3
; CHECK-NEXT: pmuludq %xmm1, %xmm3
; CHECK-NEXT: psllq $32, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: psrlq $32, %xmm1
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: psllq $32, %xmm1
; CHECK-NEXT: paddq %xmm3, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = sext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = sext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
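; As mul_16xi16, with pmulhw supplying the signed high halves.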
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16_sext:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: pmulhw %xmm0, %xmm4
; CHECK-NEXT: pmullw %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT: movdqa %xmm3, %xmm4
; CHECK-NEXT: pmulhw %xmm1, %xmm4
; CHECK-NEXT: pmullw %xmm1, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
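; With both operands at most 255, every product fits in 16 bits, so pmullw
; plus a zero unpack suffices; the constant vector is folded into a memory
; operand.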
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst1:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
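; Sign-extended bytes times constants in [-128, 127] give products that fit
; in a signed 16-bit lane: pmullw followed by punpcklwd + psrad $16.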
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst2:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
; %rst = mul <2 x i32> %op1, %op2
;
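; 256 no longer fits in 8 bits, but both operands still fit in a signed
; 16-bit lane, so the product is formed with pmullw/pmulhw.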
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst3:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
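; -1 is encoded as 65535 in the 16-bit constant vector; both operands fit in
; signed 16 bits, so pmullw/pmulhw is still exact.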
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst4:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
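; -129 does not fit in i8 (65407 is its i16 encoding), but the signed 16-bit
; multiply pair still applies.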
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst5:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
; %rst = mul <2 x i32> %op1, %op2
;
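; 128 does not fit in a signed 8-bit lane, but both constants fit in i16
; (65408 is -128), so pmullw/pmulhw is still used.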
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst6:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: psraw $8, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
; %rst = mul <2 x i32> %op1, %op2
;
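; Zero-extended i16 values times constants in [0, 65535] use the unsigned
; pair pmullw/pmulhuw.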
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst1:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhuw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
; %rst = mul <2 x i32> %op1, %op2
;
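; Sign-extended i16 values times constants in [-32768, 32767] use pmulhw
; (32768 in the constant vector is -32768 as an i16).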
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst2:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm0
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
; %rst = mul <2 x i32> %op1, %op2
;
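; 65536 does not fit in 16 bits at all, so the operands are widened and
; multiplied with the pmuludq fallback sequence.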
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst3:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
; CHECK-NEXT: movd %rcx, %xmm1
; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmuludq %xmm1, %xmm2
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: pmuludq %xmm1, %xmm0
; CHECK-NEXT: psllq $32, %xmm0
; CHECK-NEXT: paddq %xmm2, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
; %rst = mul <2 x i32> %op1, %op2
;
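; 32768 does not fit in a signed 16-bit lane while the other operand is
; sign-extended, so the pmuludq fallback is used again.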
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst4:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: psrad $16, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: movd %rcx, %xmm1
; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmuludq %xmm1, %xmm2
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: pmuludq %xmm1, %xmm0
; CHECK-NEXT: psllq $32, %xmm0
; CHECK-NEXT: paddq %xmm2, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT: retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}