@@ -209,6 +209,103 @@ define <4 x float> @test_select(float %f, float %g) {
209
209
ret <4 x float > %ret
210
210
}
211
211
212
; Two back-to-back insertqi calls with the same source vector and the same
; (length = 32, index = 32) field are redundant: the second rewrites exactly
; the bits the first one wrote. Exactly one insertqi call must remain.
; CHECK-LABEL: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}
221
+
222
; Inserting 64 bits at index 0 makes the result equal to the second argument:
; the top 64 bits of the result are undefined, and the bottom 64 bits are
; copied from the second argument. The call should fold away entirely.
; CHECK-LABEL: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}
231
+
232
; Test the several kinds of range relationship and ordering that can exist
; between two chained insertqi calls taking bits from the same source vector.
;
; Contained range, wide insert first: bits [0,32) are written, then bits
; [16,32), which lie entirely inside the first field. One insert of
; (length = 32, index = 0) is expected. The CHECK pattern requires `= call`,
; so neither of the original `tail call` lines can satisfy it.
; CHECK-LABEL: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}
241
+
242
; Contained range, narrow insert first: bits [16,32) are written, then bits
; [0,32), which fully cover the first field. One insert of
; (length = 32, index = 0) is expected.
; CHECK-LABEL: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}
250
+
251
; Overlapping ranges, low field first: bits [0,32) then bits [16,48). The
; union is the contiguous field [0,48), so one insert of
; (length = 48, index = 0) is expected.
; CHECK-LABEL: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}
259
+
260
; Overlapping ranges, high field first: bits [16,48) then bits [0,32). The
; union is still [0,48), so one insert of (length = 48, index = 0) is
; expected regardless of order.
; CHECK-LABEL: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}
268
+
269
; Adjacent ranges, low field first: bits [0,32) then bits [32,48). The two
; fields touch without overlapping, so one insert of
; (length = 48, index = 0) is expected.
; CHECK-LABEL: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}
277
+
278
; Adjacent ranges, high field first: bits [32,48) then bits [0,32). One
; insert of (length = 48, index = 0) is expected regardless of order.
; CHECK-LABEL: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}
286
+
287
; Disjoint ranges, low field first: bits [0,16) then bits [32,48). Bits
; [16,32) lie between the two fields, so they cannot be described by a single
; (length, index) pair. Both original `tail call`s must survive unchanged.
; CHECK-LABEL: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}
295
+
296
; Disjoint ranges, high field first: bits [32,48) then bits [0,16). This is
; the reverse-order counterpart of @testInsertDisjointRange. The inserts are
; issued in the opposite order so that the "do not merge" path is exercised
; for both orderings, as the other *_2 tests do for their range kinds. Both
; original `tail call`s must survive unchanged.
; CHECK-LABEL: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 0)
  ret <2 x i64> %2
}
304
+
305
+
306
+ ; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
307
+ declare <2 x i64 > @llvm.x86.sse4a.insertqi (<2 x i64 >, <2 x i64 >, i8 , i8 ) nounwind
308
+
212
309
declare <4 x float > @llvm.x86.avx.vpermilvar.ps (<4 x float >, <4 x i32 >)
213
310
define <4 x float > @test_vpermilvar_ps (<4 x float > %v ) {
214
311
; CHECK-LABEL: @test_vpermilvar_ps(
0 commit comments