Index: llvm/trunk/test/CodeGen/ARM/arm-storebytesmerge.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/arm-storebytesmerge.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/ARM/arm-storebytesmerge.ll (revision 357116)
@@ -1,341 +1,341 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7em-arm-none-eabi"
; Function Attrs: nounwind
define arm_aapcs_vfpcc void @test(i8* %v50) #0 {
; CHECK-LABEL: test:
; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r1, #65534
+; CHECK-NEXT: strh.w r1, [r0, #510]
; CHECK-NEXT: movw r1, #64506
; CHECK-NEXT: movt r1, #65020
; CHECK-NEXT: str.w r1, [r0, #506]
; CHECK-NEXT: movw r1, #63478
; CHECK-NEXT: movt r1, #63992
; CHECK-NEXT: str.w r1, [r0, #502]
; CHECK-NEXT: movw r1, #62450
; CHECK-NEXT: movt r1, #62964
; CHECK-NEXT: str.w r1, [r0, #498]
; CHECK-NEXT: movw r1, #61422
; CHECK-NEXT: movt r1, #61936
; CHECK-NEXT: str.w r1, [r0, #494]
; CHECK-NEXT: movw r1, #60394
; CHECK-NEXT: movt r1, #60908
; CHECK-NEXT: str.w r1, [r0, #490]
; CHECK-NEXT: movw r1, #59366
; CHECK-NEXT: movt r1, #59880
; CHECK-NEXT: str.w r1, [r0, #486]
; CHECK-NEXT: movw r1, #58338
; CHECK-NEXT: movt r1, #58852
; CHECK-NEXT: str.w r1, [r0, #482]
; CHECK-NEXT: movw r1, #57310
; CHECK-NEXT: movt r1, #57824
; CHECK-NEXT: str.w r1, [r0, #478]
; CHECK-NEXT: movw r1, #56282
; CHECK-NEXT: movt r1, #56796
; CHECK-NEXT: str.w r1, [r0, #474]
; CHECK-NEXT: movw r1, #55254
; CHECK-NEXT: movt r1, #55768
; CHECK-NEXT: str.w r1, [r0, #470]
; CHECK-NEXT: movw r1, #54226
; CHECK-NEXT: movt r1, #54740
; CHECK-NEXT: str.w r1, [r0, #466]
; CHECK-NEXT: movw r1, #53198
; CHECK-NEXT: movt r1, #53712
; CHECK-NEXT: str.w r1, [r0, #462]
; CHECK-NEXT: movw r1, #52170
; CHECK-NEXT: movt r1, #52684
; CHECK-NEXT: str.w r1, [r0, #458]
; CHECK-NEXT: movw r1, #51142
; CHECK-NEXT: movt r1, #51656
; CHECK-NEXT: str.w r1, [r0, #454]
; CHECK-NEXT: movw r1, #50114
; CHECK-NEXT: movt r1, #50628
; CHECK-NEXT: str.w r1, [r0, #450]
; CHECK-NEXT: movw r1, #49086
; CHECK-NEXT: movt r1, #49600
; CHECK-NEXT: str.w r1, [r0, #446]
; CHECK-NEXT: movw r1, #48058
; CHECK-NEXT: movt r1, #48572
; CHECK-NEXT: str.w r1, [r0, #442]
; CHECK-NEXT: movw r1, #47030
; CHECK-NEXT: movt r1, #47544
; CHECK-NEXT: str.w r1, [r0, #438]
; CHECK-NEXT: movw r1, #46002
; CHECK-NEXT: movt r1, #46516
; CHECK-NEXT: str.w r1, [r0, #434]
; CHECK-NEXT: movw r1, #44974
; CHECK-NEXT: movt r1, #45488
; CHECK-NEXT: str.w r1, [r0, #430]
; CHECK-NEXT: movw r1, #43946
; CHECK-NEXT: movt r1, #44460
; CHECK-NEXT: str.w r1, [r0, #426]
; CHECK-NEXT: movw r1, #42918
; CHECK-NEXT: movt r1, #43432
; CHECK-NEXT: str.w r1, [r0, #422]
; CHECK-NEXT: movw r1, #41890
; CHECK-NEXT: movt r1, #42404
; CHECK-NEXT: str.w r1, [r0, #418]
; CHECK-NEXT: movw r1, #40862
; CHECK-NEXT: movt r1, #41376
; CHECK-NEXT: str.w r1, [r0, #414]
; CHECK-NEXT: movw r1, #39834
; CHECK-NEXT: movt r1, #40348
; CHECK-NEXT: str.w r1, [r0, #410]
; CHECK-NEXT: movw r1, #38806
; CHECK-NEXT: movt r1, #39320
; CHECK-NEXT: str.w r1, [r0, #406]
; CHECK-NEXT: movw r1, #37778
; CHECK-NEXT: movt r1, #38292
; CHECK-NEXT: str.w r1, [r0, #402]
; CHECK-NEXT: movw r1, #36750
; CHECK-NEXT: movt r1, #37264
; CHECK-NEXT: str.w r1, [r0, #398]
-; CHECK-NEXT: movw r1, #65534
-; CHECK-NEXT: strh.w r1, [r0, #510]
; CHECK-NEXT: movw r1, #35722
; CHECK-NEXT: movt r1, #36236
; CHECK-NEXT: str.w r1, [r0, #394]
; CHECK-NEXT: bx lr
%v190 = getelementptr inbounds i8, i8* %v50, i32 394
store i8 -118, i8* %v190, align 1
%v191 = getelementptr inbounds i8, i8* %v50, i32 395
store i8 -117, i8* %v191, align 1
%v192 = getelementptr inbounds i8, i8* %v50, i32 396
store i8 -116, i8* %v192, align 1
%v193 = getelementptr inbounds i8, i8* %v50, i32 397
store i8 -115, i8* %v193, align 1
%v194 = getelementptr inbounds i8, i8* %v50, i32 398
store i8 -114, i8* %v194, align 1
%v195 = getelementptr inbounds i8, i8* %v50, i32 399
store i8 -113, i8* %v195, align 1
%v196 = getelementptr inbounds i8, i8* %v50, i32 400
store i8 -112, i8* %v196, align 1
%v197 = getelementptr inbounds i8, i8* %v50, i32 401
store i8 -111, i8* %v197, align 1
%v198 = getelementptr inbounds i8, i8* %v50, i32 402
store i8 -110, i8* %v198, align 1
%v199 = getelementptr inbounds i8, i8* %v50, i32 403
store i8 -109, i8* %v199, align 1
%v200 = getelementptr inbounds i8, i8* %v50, i32 404
store i8 -108, i8* %v200, align 1
%v201 = getelementptr inbounds i8, i8* %v50, i32 405
store i8 -107, i8* %v201, align 1
%v202 = getelementptr inbounds i8, i8* %v50, i32 406
store i8 -106, i8* %v202, align 1
%v203 = getelementptr inbounds i8, i8* %v50, i32 407
store i8 -105, i8* %v203, align 1
%v204 = getelementptr inbounds i8, i8* %v50, i32 408
store i8 -104, i8* %v204, align 1
%v205 = getelementptr inbounds i8, i8* %v50, i32 409
store i8 -103, i8* %v205, align 1
%v206 = getelementptr inbounds i8, i8* %v50, i32 410
store i8 -102, i8* %v206, align 1
%v207 = getelementptr inbounds i8, i8* %v50, i32 411
store i8 -101, i8* %v207, align 1
%v208 = getelementptr inbounds i8, i8* %v50, i32 412
store i8 -100, i8* %v208, align 1
%v209 = getelementptr inbounds i8, i8* %v50, i32 413
store i8 -99, i8* %v209, align 1
%v210 = getelementptr inbounds i8, i8* %v50, i32 414
store i8 -98, i8* %v210, align 1
%v211 = getelementptr inbounds i8, i8* %v50, i32 415
store i8 -97, i8* %v211, align 1
%v212 = getelementptr inbounds i8, i8* %v50, i32 416
store i8 -96, i8* %v212, align 1
%v213 = getelementptr inbounds i8, i8* %v50, i32 417
store i8 -95, i8* %v213, align 1
%v214 = getelementptr inbounds i8, i8* %v50, i32 418
store i8 -94, i8* %v214, align 1
%v215 = getelementptr inbounds i8, i8* %v50, i32 419
store i8 -93, i8* %v215, align 1
%v216 = getelementptr inbounds i8, i8* %v50, i32 420
store i8 -92, i8* %v216, align 1
%v217 = getelementptr inbounds i8, i8* %v50, i32 421
store i8 -91, i8* %v217, align 1
%v218 = getelementptr inbounds i8, i8* %v50, i32 422
store i8 -90, i8* %v218, align 1
%v219 = getelementptr inbounds i8, i8* %v50, i32 423
store i8 -89, i8* %v219, align 1
%v220 = getelementptr inbounds i8, i8* %v50, i32 424
store i8 -88, i8* %v220, align 1
%v221 = getelementptr inbounds i8, i8* %v50, i32 425
store i8 -87, i8* %v221, align 1
%v222 = getelementptr inbounds i8, i8* %v50, i32 426
store i8 -86, i8* %v222, align 1
%v223 = getelementptr inbounds i8, i8* %v50, i32 427
store i8 -85, i8* %v223, align 1
%v224 = getelementptr inbounds i8, i8* %v50, i32 428
store i8 -84, i8* %v224, align 1
%v225 = getelementptr inbounds i8, i8* %v50, i32 429
store i8 -83, i8* %v225, align 1
%v226 = getelementptr inbounds i8, i8* %v50, i32 430
store i8 -82, i8* %v226, align 1
%v227 = getelementptr inbounds i8, i8* %v50, i32 431
store i8 -81, i8* %v227, align 1
%v228 = getelementptr inbounds i8, i8* %v50, i32 432
store i8 -80, i8* %v228, align 1
%v229 = getelementptr inbounds i8, i8* %v50, i32 433
store i8 -79, i8* %v229, align 1
%v230 = getelementptr inbounds i8, i8* %v50, i32 434
store i8 -78, i8* %v230, align 1
%v231 = getelementptr inbounds i8, i8* %v50, i32 435
store i8 -77, i8* %v231, align 1
%v232 = getelementptr inbounds i8, i8* %v50, i32 436
store i8 -76, i8* %v232, align 1
%v233 = getelementptr inbounds i8, i8* %v50, i32 437
store i8 -75, i8* %v233, align 1
%v234 = getelementptr inbounds i8, i8* %v50, i32 438
store i8 -74, i8* %v234, align 1
%v235 = getelementptr inbounds i8, i8* %v50, i32 439
store i8 -73, i8* %v235, align 1
%v236 = getelementptr inbounds i8, i8* %v50, i32 440
store i8 -72, i8* %v236, align 1
%v237 = getelementptr inbounds i8, i8* %v50, i32 441
store i8 -71, i8* %v237, align 1
%v238 = getelementptr inbounds i8, i8* %v50, i32 442
store i8 -70, i8* %v238, align 1
%v239 = getelementptr inbounds i8, i8* %v50, i32 443
store i8 -69, i8* %v239, align 1
%v240 = getelementptr inbounds i8, i8* %v50, i32 444
store i8 -68, i8* %v240, align 1
%v241 = getelementptr inbounds i8, i8* %v50, i32 445
store i8 -67, i8* %v241, align 1
%v242 = getelementptr inbounds i8, i8* %v50, i32 446
store i8 -66, i8* %v242, align 1
%v243 = getelementptr inbounds i8, i8* %v50, i32 447
store i8 -65, i8* %v243, align 1
%v244 = getelementptr inbounds i8, i8* %v50, i32 448
store i8 -64, i8* %v244, align 1
%v245 = getelementptr inbounds i8, i8* %v50, i32 449
store i8 -63, i8* %v245, align 1
%v246 = getelementptr inbounds i8, i8* %v50, i32 450
store i8 -62, i8* %v246, align 1
%v247 = getelementptr inbounds i8, i8* %v50, i32 451
store i8 -61, i8* %v247, align 1
%v248 = getelementptr inbounds i8, i8* %v50, i32 452
store i8 -60, i8* %v248, align 1
%v249 = getelementptr inbounds i8, i8* %v50, i32 453
store i8 -59, i8* %v249, align 1
%v250 = getelementptr inbounds i8, i8* %v50, i32 454
store i8 -58, i8* %v250, align 1
%v251 = getelementptr inbounds i8, i8* %v50, i32 455
store i8 -57, i8* %v251, align 1
%v252 = getelementptr inbounds i8, i8* %v50, i32 456
store i8 -56, i8* %v252, align 1
%v253 = getelementptr inbounds i8, i8* %v50, i32 457
store i8 -55, i8* %v253, align 1
%v254 = getelementptr inbounds i8, i8* %v50, i32 458
store i8 -54, i8* %v254, align 1
%v255 = getelementptr inbounds i8, i8* %v50, i32 459
store i8 -53, i8* %v255, align 1
%v256 = getelementptr inbounds i8, i8* %v50, i32 460
store i8 -52, i8* %v256, align 1
%v257 = getelementptr inbounds i8, i8* %v50, i32 461
store i8 -51, i8* %v257, align 1
%v258 = getelementptr inbounds i8, i8* %v50, i32 462
store i8 -50, i8* %v258, align 1
%v259 = getelementptr inbounds i8, i8* %v50, i32 463
store i8 -49, i8* %v259, align 1
%v260 = getelementptr inbounds i8, i8* %v50, i32 464
store i8 -48, i8* %v260, align 1
%v261 = getelementptr inbounds i8, i8* %v50, i32 465
store i8 -47, i8* %v261, align 1
%v262 = getelementptr inbounds i8, i8* %v50, i32 466
store i8 -46, i8* %v262, align 1
%v263 = getelementptr inbounds i8, i8* %v50, i32 467
store i8 -45, i8* %v263, align 1
%v264 = getelementptr inbounds i8, i8* %v50, i32 468
store i8 -44, i8* %v264, align 1
%v265 = getelementptr inbounds i8, i8* %v50, i32 469
store i8 -43, i8* %v265, align 1
%v266 = getelementptr inbounds i8, i8* %v50, i32 470
store i8 -42, i8* %v266, align 1
%v267 = getelementptr inbounds i8, i8* %v50, i32 471
store i8 -41, i8* %v267, align 1
%v268 = getelementptr inbounds i8, i8* %v50, i32 472
store i8 -40, i8* %v268, align 1
%v269 = getelementptr inbounds i8, i8* %v50, i32 473
store i8 -39, i8* %v269, align 1
%v270 = getelementptr inbounds i8, i8* %v50, i32 474
store i8 -38, i8* %v270, align 1
%v271 = getelementptr inbounds i8, i8* %v50, i32 475
store i8 -37, i8* %v271, align 1
%v272 = getelementptr inbounds i8, i8* %v50, i32 476
store i8 -36, i8* %v272, align 1
%v273 = getelementptr inbounds i8, i8* %v50, i32 477
store i8 -35, i8* %v273, align 1
%v274 = getelementptr inbounds i8, i8* %v50, i32 478
store i8 -34, i8* %v274, align 1
%v275 = getelementptr inbounds i8, i8* %v50, i32 479
store i8 -33, i8* %v275, align 1
%v276 = getelementptr inbounds i8, i8* %v50, i32 480
store i8 -32, i8* %v276, align 1
%v277 = getelementptr inbounds i8, i8* %v50, i32 481
store i8 -31, i8* %v277, align 1
%v278 = getelementptr inbounds i8, i8* %v50, i32 482
store i8 -30, i8* %v278, align 1
%v279 = getelementptr inbounds i8, i8* %v50, i32 483
store i8 -29, i8* %v279, align 1
%v280 = getelementptr inbounds i8, i8* %v50, i32 484
store i8 -28, i8* %v280, align 1
%v281 = getelementptr inbounds i8, i8* %v50, i32 485
store i8 -27, i8* %v281, align 1
%v282 = getelementptr inbounds i8, i8* %v50, i32 486
store i8 -26, i8* %v282, align 1
%v283 = getelementptr inbounds i8, i8* %v50, i32 487
store i8 -25, i8* %v283, align 1
%v284 = getelementptr inbounds i8, i8* %v50, i32 488
store i8 -24, i8* %v284, align 1
%v285 = getelementptr inbounds i8, i8* %v50, i32 489
store i8 -23, i8* %v285, align 1
%v286 = getelementptr inbounds i8, i8* %v50, i32 490
store i8 -22, i8* %v286, align 1
%v287 = getelementptr inbounds i8, i8* %v50, i32 491
store i8 -21, i8* %v287, align 1
%v288 = getelementptr inbounds i8, i8* %v50, i32 492
store i8 -20, i8* %v288, align 1
%v289 = getelementptr inbounds i8, i8* %v50, i32 493
store i8 -19, i8* %v289, align 1
%v290 = getelementptr inbounds i8, i8* %v50, i32 494
store i8 -18, i8* %v290, align 1
%v291 = getelementptr inbounds i8, i8* %v50, i32 495
store i8 -17, i8* %v291, align 1
%v292 = getelementptr inbounds i8, i8* %v50, i32 496
store i8 -16, i8* %v292, align 1
%v293 = getelementptr inbounds i8, i8* %v50, i32 497
store i8 -15, i8* %v293, align 1
%v294 = getelementptr inbounds i8, i8* %v50, i32 498
store i8 -14, i8* %v294, align 1
%v295 = getelementptr inbounds i8, i8* %v50, i32 499
store i8 -13, i8* %v295, align 1
%v296 = getelementptr inbounds i8, i8* %v50, i32 500
store i8 -12, i8* %v296, align 1
%v297 = getelementptr inbounds i8, i8* %v50, i32 501
store i8 -11, i8* %v297, align 1
%v298 = getelementptr inbounds i8, i8* %v50, i32 502
store i8 -10, i8* %v298, align 1
%v299 = getelementptr inbounds i8, i8* %v50, i32 503
store i8 -9, i8* %v299, align 1
%v300 = getelementptr inbounds i8, i8* %v50, i32 504
store i8 -8, i8* %v300, align 1
%v301 = getelementptr inbounds i8, i8* %v50, i32 505
store i8 -7, i8* %v301, align 1
%v302 = getelementptr inbounds i8, i8* %v50, i32 506
store i8 -6, i8* %v302, align 1
%v303 = getelementptr inbounds i8, i8* %v50, i32 507
store i8 -5, i8* %v303, align 1
%v304 = getelementptr inbounds i8, i8* %v50, i32 508
store i8 -4, i8* %v304, align 1
%v305 = getelementptr inbounds i8, i8* %v50, i32 509
store i8 -3, i8* %v305, align 1
%v306 = getelementptr inbounds i8, i8* %v50, i32 510
store i8 -2, i8* %v306, align 1
%v307 = getelementptr inbounds i8, i8* %v50, i32 511
store i8 -1, i8* %v307, align 1
ret void
}
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+hwdiv,+thumb-mode,-crc,-crypto,-dotprod,-fp-only-sp,-fullfp16,-hwdiv-arm,-neon,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll (revision 357116)
@@ -1,332 +1,332 @@
; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
; CHECK-COMMON-LABEL: test_ult_254_inc_imm:
; CHECK-DSP: adds r0, #1
; CHECK-DSP-NEXT: uxtb r1, r0
; CHECK-DSP-NEXT: movs r0, #47
; CHECK-DSP-NEXT: cmp r1, #254
; CHECK-DSP-NEXT: it lo
; CHECK-DSP-NEXT: movlo r0, #35
; CHECK-DSP-IMM: movs r1, #1
; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
; CHECK-DSP-IMM-NEXT: movs r0, #47
; CHECK-DSP-IMM-NEXT: cmp r1, #254
; CHECK-DSP-IMM-NEXT: it lo
; CHECK-DSP-IMM-NEXT: movlo r0, #35
define i32 @test_ult_254_inc_imm(i8 zeroext %x) {
entry:
%add = add i8 %x, 1
%cmp = icmp ult i8 %add, 254
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: test_slt_254_inc_imm
; CHECK-COMMON: adds
; CHECK-COMMON: sxtb
define i32 @test_slt_254_inc_imm(i8 signext %x) {
entry:
%add = add i8 %x, 1
%cmp = icmp slt i8 %add, 254
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: test_ult_254_inc_var:
; CHECK-NODSP: add r0, r1
; CHECK-NODSP-NEXT: uxtb r1, r0
; CHECK-NODSP-NEXT: movs r0, #47
; CHECK-NODSP-NEXT: cmp r1, #254
; CHECK-NODSP-NEXT: it lo
; CHECK-NODSP-NEXT: movlo r0, #35
; CHECK-DSP: uadd8 r1, r0, r1
; CHECK-DSP-NEXT: movs r0, #47
; CHECK-DSP-NEXT: cmp r1, #254
; CHECK-DSP-NEXT: it lo
; CHECK-DSP-NEXT: movlo r0, #35
define i32 @test_ult_254_inc_var(i8 zeroext %x, i8 zeroext %y) {
entry:
%add = add i8 %x, %y
%cmp = icmp ult i8 %add, 254
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: test_sle_254_inc_var
; CHECK-COMMON: add
; CHECK-COMMON: sxtb
; CHECK-COMMON: cmp
define i32 @test_sle_254_inc_var(i8 %x, i8 %y) {
entry:
%add = add i8 %x, %y
%cmp = icmp sle i8 %add, 254
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: test_ugt_1_dec_imm:
; CHECK-COMMON: subs r1, r0, #1
; CHECK-COMMON-NEXT: movs r0, #47
; CHECK-COMMON-NEXT: cmp r1, #1
; CHECK-COMMON-NEXT: it hi
; CHECK-COMMON-NEXT: movhi r0, #35
define i32 @test_ugt_1_dec_imm(i8 zeroext %x) {
entry:
%add = add i8 %x, -1
%cmp = icmp ugt i8 %add, 1
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: test_sgt_1_dec_imm
; CHECK-COMMON: subs
; CHECK-COMMON: sxtb
; CHECK-COMMON: cmp
define i32 @test_sgt_1_dec_imm(i8 %x) {
entry:
%add = add i8 %x, -1
%cmp = icmp sgt i8 %add, 1
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: test_ugt_1_dec_var:
; CHECK-NODSP: subs r0, r0, r1
; CHECK-NODSP-NEXT: uxtb r1, r0
; CHECK-NODSP-NEXT: movs r0, #47
; CHECK-NODSP-NEXT: cmp r1, #1
; CHECK-NODSP-NEXT: it hi
; CHECK-NODSP-NEXT: movhi r0, #35
; CHECK-DSP: usub8 r1, r0, r1
; CHECK-DSP-NEXT: movs r0, #47
; CHECK-DSP-NEXT: cmp r1, #1
; CHECK-DSP-NEXT: it hi
; CHECK-DSP-NEXT: movhi r0, #35
define i32 @test_ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
entry:
%sub = sub i8 %x, %y
%cmp = icmp ugt i8 %sub, 1
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: test_sge_1_dec_var
; CHECK-COMMON: sub
; CHECK-COMMON: sxtb
; CHECK-COMMON: cmp
define i32 @test_sge_1_dec_var(i8 %x, i8 %y) {
entry:
%sub = sub i8 %x, %y
%cmp = icmp sge i8 %sub, 1
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: dsp_imm1:
; CHECK-DSP: eors r1, r0
; CHECK-DSP-NEXT: and r0, r0, #7
; CHECK-DSP-NEXT: subs r0, r0, r1
; CHECK-DSP-NEXT: adds r0, #1
; CHECK-DSP-NEXT: uxtb r1, r0
; CHECK-DSP-NEXT: movs r0, #47
; CHECK-DSP-NEXT: cmp r1, #254
; CHECK-DSP-NEXT: it lo
; CHECK-DSP-NEXT: movlo r0, #35
; CHECK-DSP-IMM: eors r1, r0
; CHECK-DSP-IMM-NEXT: and r0, r0, #7
; CHECK-DSP-IMM-NEXT: usub8 r0, r0, r1
; CHECK-DSP-IMM-NEXT: movs r1, #1
; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
; CHECK-DSP-IMM-NEXT: movs r0, #47
; CHECK-DSP-IMM-NEXT: cmp r1, #254
; CHECK-DSP-IMM-NEXT: it lo
; CHECK-DSP-IMM-NEXT: movlo r0, #35
define i32 @dsp_imm1(i8 zeroext %x, i8 zeroext %y) {
entry:
%xor = xor i8 %x, %y
%and = and i8 %x, 7
%sub = sub i8 %and, %xor
%add = add i8 %sub, 1
%cmp = icmp ult i8 %add, 254
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: dsp_var:
; CHECK-COMMON: eors r1, r0
; CHECK-COMMON: and r2, r0, #7
; CHECK-NODSP: subs r1, r2, r1
; CHECK-NODSP: add.w r0, r1, r0, lsl #1
; CHECK-NODSP: uxtb r1, r0
; CHECK-DSP: usub8 r1, r2, r1
; CHECK-DSP: lsls r0, r0, #1
; CHECK-DSP: uadd8 r1, r1, r0
; CHECK-DSP-NOT: uxt
; CHECK-COMMON: movs r0, #47
; CHECK-COMMON: cmp r1, #254
; CHECK-COMMON: it lo
; CHECK-COMMON: movlo r0, #35
define i32 @dsp_var(i8 zeroext %x, i8 zeroext %y) {
%xor = xor i8 %x, %y
%and = and i8 %x, 7
%sub = sub i8 %and, %xor
%mul = shl nuw i8 %x, 1
%add = add i8 %sub, %mul
%cmp = icmp ult i8 %add, 254
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: store_dsp_res
; CHECK-DSP: usub8
; CHECK-DSP: strb
define void @store_dsp_res(i8* %in, i8* %out, i8 %compare) {
%first = getelementptr inbounds i8, i8* %in, i32 0
%second = getelementptr inbounds i8, i8* %in, i32 1
%ld0 = load i8, i8* %first
%ld1 = load i8, i8* %second
%xor = xor i8 %ld0, -1
%cmp = icmp ult i8 %compare, %ld1
%select = select i1 %cmp, i8 %compare, i8 %xor
%sub = sub i8 %ld0, %select
store i8 %sub, i8* %out, align 1
ret void
}
; CHECK-COMMON-LABEL: ugt_1_dec_imm:
; CHECK-COMMON: subs r1, r0, #1
; CHECK-COMMON-NEXT: movs r0, #47
; CHECK-COMMON-NEXT: cmp r1, #1
; CHECK-COMMON-NEXT: it hi
; CHECK-COMMON-NEXT: movhi r0, #35
define i32 @ugt_1_dec_imm(i8 zeroext %x) {
entry:
%add = add i8 %x, -1
%cmp = icmp ugt i8 %add, 1
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: ugt_1_dec_var:
; CHECK-NODSP: subs r0, r0, r1
; CHECK-NODSP-NEXT: uxtb r1, r0
; CHECK-NODSP-NEXT: movs r0, #47
; CHECK-NODSP-NEXT: cmp r1, #1
; CHECK-NODSP-NEXT: it hi
; CHECK-NODSP-NEXT: movhi r0, #35
; CHECK-DSP: usub8 r1, r0, r1
; CHECK-DSP-NEXT: movs r0, #47
; CHECK-DSP-NEXT: cmp r1, #1
; CHECK-DSP-NEXT: it hi
; CHECK-DSP-NEXT: movhi r0, #35
define i32 @ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
entry:
%sub = sub i8 %x, %y
%cmp = icmp ugt i8 %sub, 1
%res = select i1 %cmp, i32 35, i32 47
ret i32 %res
}
; CHECK-COMMON-LABEL: icmp_eq_minus_one
; CHECK-COMMON: cmp r0, #255
define i32 @icmp_eq_minus_one(i8* %ptr) {
%load = load i8, i8* %ptr, align 1
%conv = zext i8 %load to i32
%cmp = icmp eq i8 %load, -1
%ret = select i1 %cmp, i32 %conv, i32 -1
ret i32 %ret
}
; CHECK-COMMON-LABEL: icmp_not
; CHECK-COMMON: movw r2, #65535
; CHECK-COMMON: eors r2, r0
; CHECK-COMMON: movs r0, #32
; CHECK-COMMON: cmp r2, r1
define i32 @icmp_not(i16 zeroext %arg0, i16 zeroext %arg1) {
%not = xor i16 %arg0, -1
%cmp = icmp eq i16 %not, %arg1
%res = select i1 %cmp, i32 16, i32 32
ret i32 %res
}
; CHECK-COMMON-LABEL: icmp_i1
; CHECK-NOT: uxt
define i32 @icmp_i1(i1* %arg0, i1 zeroext %arg1, i32 %a, i32 %b) {
entry:
%load = load i1, i1* %arg0
%not = xor i1 %load, 1
%cmp = icmp eq i1 %arg1, %not
%res = select i1 %cmp, i32 %a, i32 %b
ret i32 %res
}
; CHECK-COMMON-LABEL: icmp_i7
; CHECK-COMMON: ldrb
; CHECK-COMMON: cmp
define i32 @icmp_i7(i7* %arg0, i7 zeroext %arg1, i32 %a, i32 %b) {
entry:
%load = load i7, i7* %arg0
%add = add nuw i7 %load, 1
%cmp = icmp ult i7 %arg1, %add
%res = select i1 %cmp, i32 %a, i32 %b
ret i32 %res
}
; CHECK-COMMON-LABEL: icmp_i15
; CHECK-COMMON: movw [[MINUS_ONE:r[0-9]+]], #32767
define i32 @icmp_i15(i15 zeroext %arg0, i15 zeroext %arg1) {
%xor = xor i15 %arg0, -1
%cmp = icmp eq i15 %xor, %arg1
%res = select i1 %cmp, i32 21, i32 42
ret i32 %res
}
; CHECK-COMMON-LABEL: icmp_minus_imm
; CHECK-NODSP: subs [[SUB:r[0-9]+]],
; CHECK-NODSP: uxtb [[UXT:r[0-9]+]],
; CHECK-NODSP: cmp [[UXT]], #251
; CHECK-DSP: subs [[SUB:r[0-9]+]],
; CHECK-DSP: uxtb [[UXT:r[0-9]+]],
; CHECK-DSP: cmp [[UXT]], #251
; CHECK-DSP-IMM: ldrb [[A:r[0-9]+]],
; CHECK-DSP-IMM: movs [[MINUS_7:r[0-9]+]], #249
; CHECK-DSP-IMM: uadd8 [[RES:r[0-9]+]], [[A]], [[MINUS_7]]
; CHECK-DSP-IMM: cmp [[RES]], #251
define i32 @icmp_minus_imm(i8* %a) {
entry:
%0 = load i8, i8* %a, align 1
%add.i = add i8 %0, -7
%cmp = icmp ugt i8 %add.i, -5
%conv1 = zext i1 %cmp to i32
ret i32 %conv1
}
; CHECK-COMMON-LABEL: mul_with_neg_imm
; CHECK-COMMON-NOT: uxtb
; CHECK-COMMON: and [[BIT0:r[0-9]+]], r0, #1
-; CHECK-COMMON: orr.w [[MUL32:r[0-9]+]], [[BIT0]], [[BIT0]], lsl #5
+; CHECK-COMMON: add.w [[MUL32:r[0-9]+]], [[BIT0]], [[BIT0]], lsl #5
; CHECK-COMMON: cmp.w r0, [[MUL32]], lsl #2
define void @mul_with_neg_imm(i32, i32* %b) {
entry:
%1 = trunc i32 %0 to i8
%2 = and i8 %1, 1
%conv.i = mul nuw i8 %2, -124
%tobool = icmp eq i8 %conv.i, 0
br i1 %tobool, label %if.end, label %if.then
if.then:
store i32 0, i32* %b, align 4
br label %if.end
if.end:
ret void
}
Index: llvm/trunk/test/CodeGen/ARM/vdup.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vdup.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/ARM/vdup.ll (revision 357116)
@@ -1,572 +1,574 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s
define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.8 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
ret <8 x i8> %tmp8
}
define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.16 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
ret <2 x i32> %tmp2
}
define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
ret <2 x float> %tmp2
}
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.8 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
ret <16 x i8> %tmp16
}
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.16 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
ret <8 x i16> %tmp8
}
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
ret <4 x i32> %tmp4
}
define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
ret <4 x float> %tmp4
}
; Check to make sure it works with shuffles, too.
define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.8 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp2
}
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.16 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp2
}
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp2
}
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <2 x float> undef, float %A, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
}
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.8 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp2
}
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.16 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp2
}
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp2
}
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x float> undef, float %A, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}
define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.8 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.16 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x float>, <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.8 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.16 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x float>, <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}
define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: mov pc, lr
entry:
%0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %0
}
define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: mov pc, lr
entry:
%0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
ret <2 x i64> %0
}
define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: baz:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: mov pc, lr
entry:
%0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %0
}
define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: qux:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: mov pc, lr
entry:
%0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
ret <2 x double> %0
}
; Radar 7373643
define void @redundantVdup(<8 x i8>* %ptr) nounwind {
; CHECK-LABEL: redundantVdup:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov.i8 d16, #0x80
; CHECK-NEXT: vstr d16, [r0]
; CHECK-NEXT: mov pc, lr
%1 = insertelement <8 x i8> undef, i8 -128, i32 0
%2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
store <8 x i8> %2, <8 x i8>* %ptr, align 8
ret void
}
define <4 x i32> @tdupi(i32 %x, i32 %y) {
; CHECK-LABEL: tdupi:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov.32 d17[1], r1
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%1 = insertelement <4 x i32> undef, i32 %x, i32 0
%2 = insertelement <4 x i32> %1, i32 %x, i32 1
%3 = insertelement <4 x i32> %2, i32 %x, i32 2
%4 = insertelement <4 x i32> %3, i32 %y, i32 3
ret <4 x i32> %4
}
define <4 x float> @tdupf(float %x, float %y) {
; CHECK-LABEL: tdupf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q0, r0
; CHECK-NEXT: vmov s3, r1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: mov pc, lr
%1 = insertelement <4 x float> undef, float %x, i32 0
%2 = insertelement <4 x float> %1, float %x, i32 1
%3 = insertelement <4 x float> %2, float %x, i32 2
%4 = insertelement <4 x float> %3, float %y, i32 3
ret <4 x float> %4
}
; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
define <4 x i32> @tduplane(<4 x i32> %invec) {
; CHECK-LABEL: tduplane:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: mov r0, #255
; CHECK-NEXT: vdup.32 q8, d16[1]
; CHECK-NEXT: vmov.32 d17[1], r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%in = extractelement <4 x i32> %invec, i32 1
%1 = insertelement <4 x i32> undef, i32 %in, i32 0
%2 = insertelement <4 x i32> %1, i32 %in, i32 1
%3 = insertelement <4 x i32> %2, i32 %in, i32 2
%4 = insertelement <4 x i32> %3, i32 255, i32 3
ret <4 x i32> %4
}
define <2 x float> @check_f32(<4 x float> %v) nounwind {
; CHECK-LABEL: check_f32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.32 d16, d17[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%x = extractelement <4 x float> %v, i32 3
%1 = insertelement <2 x float> undef, float %x, i32 0
%2 = insertelement <2 x float> %1, float %x, i32 1
ret <2 x float> %2
}
define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
; CHECK-LABEL: check_i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.32 d16, d17[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%x = extractelement <4 x i32> %v, i32 3
%1 = insertelement <2 x i32> undef, i32 %x, i32 0
%2 = insertelement <2 x i32> %1, i32 %x, i32 1
ret <2 x i32> %2
}
define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
; CHECK-LABEL: check_i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.16 d16, d16[3]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%x = extractelement <8 x i16> %v, i32 3
%1 = insertelement <4 x i16> undef, i16 %x, i32 0
%2 = insertelement <4 x i16> %1, i16 %x, i32 1
ret <4 x i16> %2
}
define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
; CHECK-LABEL: check_i8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.8 d16, d16[3]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%x = extractelement <16 x i8> %v, i32 3
%1 = insertelement <8 x i8> undef, i8 %x, i32 0
%2 = insertelement <8 x i8> %1, i8 %x, i32 1
ret <8 x i8> %2
}
; Check that an SPR splat produces a vdup.
define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat2:
; CHECK: @ %bb.0:
; CHECK-NEXT: lsl r2, r2, #16
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: asr r2, r2, #16
; CHECK-NEXT: vmov s0, r2
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vdup.32 d17, d0[0]
; CHECK-NEXT: vsub.f32 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%conv = sitofp i16 %q to float
%splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
%splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
%sub = fsub <2 x float> %splat.splat, %p
ret <2 x float> %sub
}
define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldrsh r12, [sp]
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vmov s0, r12
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vdup.32 q9, d0[0]
; CHECK-NEXT: vsub.f32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%conv = sitofp i16 %q to float
%splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
%splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
%sub = fsub <4 x float> %splat.splat, %p
ret <4 x float> %sub
}
; Same codegen as above test; scalar is splatted using vld1, so shuffle index is irrelevant.
define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4_lane1:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldrsh r12, [sp]
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vmov s0, r12
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vdup.32 q9, d0[0]
; CHECK-NEXT: vsub.f32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%conv = sitofp i16 %q to float
%splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
%splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%sub = fsub <4 x float> %splat.splat, %p
ret <4 x float> %sub
}
; Also make sure we don't barf on variable-index extractelts, where we almost
; could have generated a vdup.
define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK-LABEL: check_i8_varidx:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r11}
; CHECK-NEXT: push {r11}
; CHECK-NEXT: .setfp r11, sp
; CHECK-NEXT: mov r11, sp
; CHECK-NEXT: .pad #28
; CHECK-NEXT: sub sp, sp, #28
; CHECK-NEXT: bic sp, sp, #15
; CHECK-NEXT: ldr r12, [r11, #4]
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: and r0, r12, #15
; CHECK-NEXT: vst1.64 {d16, d17}, [r1:128], r0
; CHECK-NEXT: vld1.8 {d16[]}, [r1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov sp, r11
; CHECK-NEXT: pop {r11}
; CHECK-NEXT: mov pc, lr
%x = extractelement <16 x i8> %v, i32 %idx
%1 = insertelement <8 x i8> undef, i8 %x, i32 0
%2 = insertelement <8 x i8> %1, i8 %x, i32 1
ret <8 x i8> %2
}
Index: llvm/trunk/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll (revision 357116)
@@ -1,859 +1,859 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-P8
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-P9
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-BE
define <2 x double> @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
; CHECK-P8-LABEL: test2elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: addi r3, r4, .LCPI0_0@toc@l
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: lvx v3, 0, r3
; CHECK-P8-NEXT: vperm v2, v4, v2, v3
; CHECK-P8-NEXT: xvcvuxddp v2, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test2elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp v2, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test2elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha
; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp v2, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i16 %a.coerce to <2 x i8>
%1 = uitofp <2 x i8> %0 to <2 x double>
ret <2 x double> %1
}
define void @test4elt(<4 x double>* noalias nocapture sret %agg.result, i32 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test4elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI1_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI1_0@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: lvx v5, 0, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: vperm v2, v4, v3, v2
; CHECK-P8-NEXT: vperm v3, v4, v3, v5
; CHECK-P8-NEXT: xvcvuxddp vs0, v2
; CHECK-P8-NEXT: xvcvuxddp vs1, v3
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: stxvd2x vs1, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test4elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxddp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test4elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxddp vs1, v2
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i32 %a.coerce to <4 x i8>
%1 = uitofp <4 x i8> %0 to <4 x double>
store <4 x double> %1, <4 x double>* %agg.result, align 32
ret void
}
define void @test8elt(<8 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test8elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI2_2@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI2_2@toc@l
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI2_3@toc@ha
; CHECK-P8-NEXT: lvx v5, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: addi r5, r5, .LCPI2_3@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P8-NEXT: lvx v0, 0, r5
; CHECK-P8-NEXT: lvx v1, 0, r4
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: vperm v2, v4, v3, v2
; CHECK-P8-NEXT: vperm v5, v4, v3, v5
; CHECK-P8-NEXT: vperm v0, v4, v3, v0
; CHECK-P8-NEXT: vperm v3, v4, v3, v1
; CHECK-P8-NEXT: xvcvuxddp vs0, v2
; CHECK-P8-NEXT: xvcvuxddp vs1, v5
; CHECK-P8-NEXT: xvcvuxddp vs2, v0
; CHECK-P8-NEXT: xvcvuxddp vs3, v3
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: stxvd2x vs2, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd f0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_2@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxddp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_3@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: xvcvuxddp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: xvcvuxddp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_2@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxddp vs1, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_3@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: xvcvuxddp vs2, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v4, v2, v3
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: xvcvuxddp vs3, v2
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i64 %a.coerce to <8 x i8>
%1 = uitofp <8 x i8> %0 to <8 x double>
store <8 x double> %1, <8 x double>* %agg.result, align 64
ret void
}
define void @test16elt(<16 x double>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #2 {
; CHECK-P8-LABEL: test16elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_1@toc@ha
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_1@toc@l
; CHECK-P8-NEXT: lvx v3, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-P8-NEXT: lvx v5, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_4@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI3_2@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_4@toc@l
; CHECK-P8-NEXT: lvx v0, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI3_6@toc@ha
; CHECK-P8-NEXT: lvx v1, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_7@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI3_6@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_7@toc@l
; CHECK-P8-NEXT: vperm v3, v4, v2, v3
; CHECK-P8-NEXT: lvx v6, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI3_5@toc@ha
; CHECK-P8-NEXT: lvx v7, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_3@toc@ha
; CHECK-P8-NEXT: vperm v5, v4, v2, v5
; CHECK-P8-NEXT: addi r4, r4, .LCPI3_5@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_3@toc@l
; CHECK-P8-NEXT: vperm v0, v4, v2, v0
; CHECK-P8-NEXT: lvx v8, 0, r4
; CHECK-P8-NEXT: lvx v9, 0, r5
; CHECK-P8-NEXT: vperm v1, v4, v2, v1
; CHECK-P8-NEXT: li r4, 112
; CHECK-P8-NEXT: li r5, 96
; CHECK-P8-NEXT: vperm v6, v4, v2, v6
; CHECK-P8-NEXT: vperm v7, v4, v2, v7
; CHECK-P8-NEXT: vperm v8, v4, v2, v8
; CHECK-P8-NEXT: vperm v2, v4, v2, v9
; CHECK-P8-NEXT: xvcvuxddp vs0, v0
; CHECK-P8-NEXT: xvcvuxddp vs1, v1
; CHECK-P8-NEXT: xvcvuxddp vs2, v6
; CHECK-P8-NEXT: xvcvuxddp vs3, v7
; CHECK-P8-NEXT: xvcvuxddp vs4, v8
; CHECK-P8-NEXT: xvcvuxddp vs5, v2
; CHECK-P8-NEXT: xvcvuxddp vs6, v3
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xvcvuxddp vs7, v5
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: xxswapd vs4, vs4
; CHECK-P8-NEXT: xxswapd vs5, vs5
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: stxvd2x vs2, r3, r5
; CHECK-P8-NEXT: li r4, 80
; CHECK-P8-NEXT: li r5, 64
; CHECK-P8-NEXT: xxswapd vs2, vs7
; CHECK-P8-NEXT: xxswapd vs3, vs6
; CHECK-P8-NEXT: stxvd2x vs4, r3, r4
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: stxvd2x vs5, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stxvd2x vs0, r3, r5
; CHECK-P8-NEXT: stxvd2x vs2, r3, r4
; CHECK-P8-NEXT: stxvd2x vs3, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_2@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxddp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: xvcvuxddp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_4@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_4@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: xvcvuxddp vs3, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_5@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_5@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: xvcvuxddp vs4, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_6@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_6@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs4, 64(r3)
; CHECK-P9-NEXT: xvcvuxddp vs5, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_7@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_7@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs5, 80(r3)
; CHECK-P9-NEXT: xvcvuxddp vs6, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: stxv vs6, 96(r3)
; CHECK-P9-NEXT: xvcvuxddp vs7, v2
; CHECK-P9-NEXT: stxv vs7, 112(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_2@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxddp vs1, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: xvcvuxddp vs2, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_4@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_4@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: xvcvuxddp vs3, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_5@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_5@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: xvcvuxddp vs4, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_6@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_6@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs4, 64(r3)
; CHECK-BE-NEXT: xvcvuxddp vs5, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_7@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_7@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs5, 80(r3)
; CHECK-BE-NEXT: xvcvuxddp vs6, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v4, v2, v3
; CHECK-BE-NEXT: stxv vs6, 96(r3)
; CHECK-BE-NEXT: xvcvuxddp vs7, v2
; CHECK-BE-NEXT: stxv vs7, 112(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = uitofp <16 x i8> %a to <16 x double>
store <16 x double> %0, <16 x double>* %agg.result, align 128
ret void
}
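; The signed variants below exercise sitofp from small <N x i8> vectors
; (packed into scalar arguments for the narrower cases) to <N x double>.
; As the CHECK lines show, CHECK-P9/CHECK-BE expect a vperm to place the
; bytes into doubleword lanes, vextsb2d to sign-extend them, and
; xvcvsxddp to convert, while CHECK-P8 (which has no vextsb2d) expects
; the sign-extension to be done with a vsld/vsrad pair using a shift
; amount loaded from the constant pool.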
define <2 x double> @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
; CHECK-P8-LABEL: test2elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI4_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: addi r3, r4, .LCPI4_0@toc@l
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: lvx v3, 0, r3
; CHECK-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha
; CHECK-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
; CHECK-P8-NEXT: vperm v2, v2, v2, v3
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: vsld v2, v2, v3
; CHECK-P8-NEXT: vsrad v2, v2, v3
; CHECK-P8-NEXT: xvcvsxddp v2, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test2elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: vextsb2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp v2, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test2elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-BE-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-NEXT: vextsb2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp v2, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i16 %a.coerce to <2 x i8>
%1 = sitofp <2 x i8> %0 to <2 x double>
ret <2 x double> %1
}
define void @test4elt_signed(<4 x double>* noalias nocapture sret %agg.result, i32 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test4elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI5_2@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI5_2@toc@l
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: lvx v4, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI5_1@toc@l
; CHECK-P8-NEXT: lxvd2x vs0, 0, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: vperm v2, v3, v3, v2
; CHECK-P8-NEXT: vperm v3, v3, v3, v4
; CHECK-P8-NEXT: xxswapd v4, vs0
; CHECK-P8-NEXT: vsld v2, v2, v4
; CHECK-P8-NEXT: vsld v3, v3, v4
; CHECK-P8-NEXT: vsrad v2, v2, v4
; CHECK-P8-NEXT: vsrad v3, v3, v4
; CHECK-P8-NEXT: xvcvsxddp vs0, v2
; CHECK-P8-NEXT: xvcvsxddp vs1, v3
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: stxvd2x vs1, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test4elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI5_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI5_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI5_1@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsb2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test4elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: vperm v3, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: vextsb2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp vs1, v2
; CHECK-BE-NEXT: stxv vs1, 0(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i32 %a.coerce to <4 x i8>
%1 = sitofp <4 x i8> %0 to <4 x double>
store <4 x double> %1, <4 x double>* %agg.result, align 32
ret void
}
define void @test8elt_signed(<8 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test8elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-P8-NEXT: addis r5, r2, .LCPI6_0@toc@ha
; CHECK-P8-NEXT: addis r6, r2, .LCPI6_3@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI6_2@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI6_0@toc@l
; CHECK-P8-NEXT: addi r6, r6, .LCPI6_3@toc@l
; CHECK-P8-NEXT: lvx v4, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI6_4@toc@ha
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: lvx v5, 0, r6
; CHECK-P8-NEXT: addis r5, r2, .LCPI6_1@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI6_4@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI6_1@toc@l
; CHECK-P8-NEXT: lvx v0, 0, r4
; CHECK-P8-NEXT: lxvd2x vs0, 0, r5
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: vperm v2, v3, v3, v2
; CHECK-P8-NEXT: vperm v4, v3, v3, v4
; CHECK-P8-NEXT: vperm v5, v3, v3, v5
; CHECK-P8-NEXT: vperm v3, v3, v3, v0
; CHECK-P8-NEXT: xxswapd v0, vs0
; CHECK-P8-NEXT: vsld v2, v2, v0
; CHECK-P8-NEXT: vsld v4, v4, v0
; CHECK-P8-NEXT: vsld v5, v5, v0
; CHECK-P8-NEXT: vsld v3, v3, v0
; CHECK-P8-NEXT: vsrad v2, v2, v0
; CHECK-P8-NEXT: vsrad v3, v3, v0
; CHECK-P8-NEXT: vsrad v4, v4, v0
; CHECK-P8-NEXT: vsrad v5, v5, v0
; CHECK-P8-NEXT: xvcvsxddp vs2, v3
; CHECK-P8-NEXT: xvcvsxddp vs0, v2
; CHECK-P8-NEXT: xvcvsxddp vs1, v5
; CHECK-P8-NEXT: xvcvsxddp vs3, v4
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: stxvd2x vs2, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd f0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_2@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: vextsb2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_2@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs1, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_3@toc@l
-; CHECK-BE-NEXT: vperm v3, v4, v2, v3
-; CHECK-BE-NEXT: stxv vs1, 32(r3)
+; CHECK-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-BE-NEXT: stxv vs1, 48(r3)
; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs2, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
-; CHECK-BE-NEXT: stxv vs2, 48(r3)
+; CHECK-BE-NEXT: stxv vs2, 0(r3)
; CHECK-BE-NEXT: vextsb2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp vs3, v2
-; CHECK-BE-NEXT: stxv vs3, 0(r3)
+; CHECK-BE-NEXT: stxv vs3, 32(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i64 %a.coerce to <8 x i8>
%1 = sitofp <8 x i8> %0 to <8 x double>
store <8 x double> %1, <8 x double>* %agg.result, align 64
ret void
}
define void @test16elt_signed(<16 x double>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #2 {
; CHECK-P8-LABEL: test16elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI7_0@toc@ha
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_2@toc@ha
; CHECK-P8-NEXT: addis r6, r2, .LCPI7_3@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_2@toc@l
; CHECK-P8-NEXT: addi r6, r6, .LCPI7_3@toc@l
; CHECK-P8-NEXT: lvx v3, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI7_4@toc@ha
; CHECK-P8-NEXT: lvx v4, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_5@toc@ha
; CHECK-P8-NEXT: lvx v5, 0, r6
; CHECK-P8-NEXT: addis r6, r2, .LCPI7_1@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI7_4@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_5@toc@l
; CHECK-P8-NEXT: addi r6, r6, .LCPI7_1@toc@l
; CHECK-P8-NEXT: lvx v0, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI7_6@toc@ha
; CHECK-P8-NEXT: lvx v1, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_7@toc@ha
; CHECK-P8-NEXT: lxvd2x vs0, 0, r6
; CHECK-P8-NEXT: addi r4, r4, .LCPI7_6@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_7@toc@l
; CHECK-P8-NEXT: vperm v3, v2, v2, v3
; CHECK-P8-NEXT: lvx v6, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI7_8@toc@ha
; CHECK-P8-NEXT: lvx v7, 0, r5
; CHECK-P8-NEXT: vperm v4, v2, v2, v4
; CHECK-P8-NEXT: li r5, 96
; CHECK-P8-NEXT: addi r4, r4, .LCPI7_8@toc@l
; CHECK-P8-NEXT: vperm v5, v2, v2, v5
; CHECK-P8-NEXT: xxswapd v9, vs0
; CHECK-P8-NEXT: lvx v8, 0, r4
; CHECK-P8-NEXT: vperm v0, v2, v2, v0
; CHECK-P8-NEXT: li r4, 112
; CHECK-P8-NEXT: vperm v1, v2, v2, v1
; CHECK-P8-NEXT: vperm v6, v2, v2, v6
; CHECK-P8-NEXT: vperm v7, v2, v2, v7
; CHECK-P8-NEXT: vperm v2, v2, v2, v8
; CHECK-P8-NEXT: vsld v3, v3, v9
; CHECK-P8-NEXT: vsld v0, v0, v9
; CHECK-P8-NEXT: vsld v1, v1, v9
; CHECK-P8-NEXT: vsld v6, v6, v9
; CHECK-P8-NEXT: vsld v7, v7, v9
; CHECK-P8-NEXT: vsld v2, v2, v9
; CHECK-P8-NEXT: vsrad v7, v7, v9
; CHECK-P8-NEXT: vsrad v2, v2, v9
; CHECK-P8-NEXT: vsld v4, v4, v9
; CHECK-P8-NEXT: vsld v5, v5, v9
; CHECK-P8-NEXT: vsrad v6, v6, v9
; CHECK-P8-NEXT: vsrad v0, v0, v9
; CHECK-P8-NEXT: vsrad v1, v1, v9
; CHECK-P8-NEXT: xvcvsxddp vs2, v7
; CHECK-P8-NEXT: xvcvsxddp vs3, v2
; CHECK-P8-NEXT: vsrad v3, v3, v9
; CHECK-P8-NEXT: vsrad v4, v4, v9
; CHECK-P8-NEXT: vsrad v5, v5, v9
; CHECK-P8-NEXT: xvcvsxddp vs4, v6
; CHECK-P8-NEXT: xvcvsxddp vs1, v1
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xvcvsxddp vs5, v0
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: xvcvsxddp vs0, v5
; CHECK-P8-NEXT: xvcvsxddp vs6, v3
; CHECK-P8-NEXT: xvcvsxddp vs7, v4
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: li r4, 80
; CHECK-P8-NEXT: xxswapd vs4, vs4
; CHECK-P8-NEXT: stxvd2x vs2, r3, r5
; CHECK-P8-NEXT: li r5, 64
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xxswapd vs5, vs5
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: stxvd2x vs4, r3, r4
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: xxswapd vs3, vs6
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: xxswapd vs2, vs7
; CHECK-P8-NEXT: stxvd2x vs5, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stxvd2x vs0, r3, r5
; CHECK-P8-NEXT: stxvd2x vs2, r3, r4
; CHECK-P8-NEXT: stxvd2x vs3, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_1@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_2@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_4@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_4@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs3, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_5@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_5@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs4, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_6@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_6@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs4, 64(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs5, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_7@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_7@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs5, 80(r3)
; CHECK-P9-NEXT: vextsb2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs6, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs6, 96(r3)
; CHECK-P9-NEXT: vextsb2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs7, v2
; CHECK-P9-NEXT: stxv vs7, 112(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: vperm v4, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l
; CHECK-BE-NEXT: vextsb2d v4, v4
; CHECK-BE-NEXT: xvcvsxddp vs0, v4
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l
; CHECK-BE-NEXT: vperm v4, v3, v2, v4
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: vextsb2d v4, v4
; CHECK-BE-NEXT: xvcvsxddp vs1, v4
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l
; CHECK-BE-NEXT: vperm v4, v3, v2, v4
-; CHECK-BE-NEXT: stxv vs1, 32(r3)
+; CHECK-BE-NEXT: stxv vs1, 48(r3)
; CHECK-BE-NEXT: vextsb2d v4, v4
; CHECK-BE-NEXT: xvcvsxddp vs2, v4
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_4@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_4@toc@l
-; CHECK-BE-NEXT: vperm v4, v3, v2, v4
-; CHECK-BE-NEXT: stxv vs2, 48(r3)
-; CHECK-BE-NEXT: vextsb2d v4, v4
-; CHECK-BE-NEXT: xvcvsxddp vs3, v4
-; CHECK-BE-NEXT: lxvx v4, 0, r4
+; CHECK-BE-NEXT: vperm v3, v3, v2, v4
+; CHECK-BE-NEXT: stxv vs2, 80(r3)
+; CHECK-BE-NEXT: vextsb2d v3, v3
+; CHECK-BE-NEXT: xvcvsxddp vs3, v3
+; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_5@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_5@toc@l
-; CHECK-BE-NEXT: vperm v4, v3, v2, v4
-; CHECK-BE-NEXT: stxv vs3, 64(r3)
-; CHECK-BE-NEXT: vextsb2d v4, v4
-; CHECK-BE-NEXT: xvcvsxddp vs4, v4
-; CHECK-BE-NEXT: lxvx v4, 0, r4
+; CHECK-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-BE-NEXT: stxv vs3, 112(r3)
+; CHECK-BE-NEXT: vextsb2d v3, v3
+; CHECK-BE-NEXT: xvcvsxddp vs4, v3
+; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_6@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_6@toc@l
-; CHECK-BE-NEXT: vperm v4, v3, v2, v4
-; CHECK-BE-NEXT: stxv vs4, 80(r3)
-; CHECK-BE-NEXT: vextsb2d v4, v4
-; CHECK-BE-NEXT: xvcvsxddp vs5, v4
-; CHECK-BE-NEXT: lxvx v4, 0, r4
+; CHECK-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-BE-NEXT: stxv vs4, 0(r3)
+; CHECK-BE-NEXT: vextsb2d v3, v3
+; CHECK-BE-NEXT: xvcvsxddp vs5, v3
+; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_7@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_7@toc@l
-; CHECK-BE-NEXT: vperm v3, v3, v2, v4
-; CHECK-BE-NEXT: stxv vs5, 96(r3)
+; CHECK-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-BE-NEXT: stxv vs5, 32(r3)
; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs6, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
-; CHECK-BE-NEXT: stxv vs6, 112(r3)
+; CHECK-BE-NEXT: stxv vs6, 64(r3)
; CHECK-BE-NEXT: vextsb2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp vs7, v2
-; CHECK-BE-NEXT: stxv vs7, 0(r3)
+; CHECK-BE-NEXT: stxv vs7, 96(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = sitofp <16 x i8> %a to <16 x double>
store <16 x double> %0, <16 x double>* %agg.result, align 128
ret void
}
Index: llvm/trunk/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll (revision 357116)
@@ -1,791 +1,791 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-P8
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-P9
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-BE
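; The unsigned tests below convert <N x i16> to <N x double>. The CHECK
; lines expect the halfwords to be zero-extended into doubleword lanes by
; a vperm against a zeroed register (xxlxor), followed by xvcvuxddp;
; wider results are written back one 16-byte vector at a time (stxv on
; P9/BE, xxswapd plus stxvd2x on little-endian P8).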
define <2 x double> @test2elt(i32 %a.coerce) local_unnamed_addr #0 {
; CHECK-P8-LABEL: test2elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: addi r3, r4, .LCPI0_0@toc@l
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: lvx v3, 0, r3
; CHECK-P8-NEXT: vperm v2, v4, v2, v3
; CHECK-P8-NEXT: xvcvuxddp v2, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test2elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp v2, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test2elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha
; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp v2, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i32 %a.coerce to <2 x i16>
%1 = uitofp <2 x i16> %0 to <2 x double>
ret <2 x double> %1
}
define void @test4elt(<4 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test4elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI1_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI1_0@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: lvx v5, 0, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: vperm v2, v4, v3, v2
; CHECK-P8-NEXT: vperm v3, v4, v3, v5
; CHECK-P8-NEXT: xvcvuxddp vs0, v2
; CHECK-P8-NEXT: xvcvuxddp vs1, v3
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: stxvd2x vs1, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test4elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd f0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxddp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test4elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxddp vs1, v2
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i64 %a.coerce to <4 x i16>
%1 = uitofp <4 x i16> %0 to <4 x double>
store <4 x double> %1, <4 x double>* %agg.result, align 32
ret void
}
define void @test8elt(<8 x double>* noalias nocapture sret %agg.result, <8 x i16> %a) local_unnamed_addr #2 {
; CHECK-P8-LABEL: test8elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-P8-NEXT: addis r5, r2, .LCPI2_2@toc@ha
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI2_2@toc@l
; CHECK-P8-NEXT: lvx v3, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI2_3@toc@ha
; CHECK-P8-NEXT: lvx v5, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI2_1@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI2_3@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI2_1@toc@l
; CHECK-P8-NEXT: lvx v0, 0, r4
; CHECK-P8-NEXT: lvx v1, 0, r5
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: vperm v3, v4, v2, v3
; CHECK-P8-NEXT: vperm v5, v4, v2, v5
; CHECK-P8-NEXT: vperm v0, v4, v2, v0
; CHECK-P8-NEXT: vperm v2, v4, v2, v1
; CHECK-P8-NEXT: xvcvuxddp vs0, v3
; CHECK-P8-NEXT: xvcvuxddp vs1, v5
; CHECK-P8-NEXT: xvcvuxddp vs2, v0
; CHECK-P8-NEXT: xvcvuxddp vs3, v2
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: stxvd2x vs2, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_2@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxddp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_3@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: xvcvuxddp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: xvcvuxddp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_2@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxddp vs1, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_3@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: xvcvuxddp vs2, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v4, v2, v3
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: xvcvuxddp vs3, v2
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = uitofp <8 x i16> %a to <8 x double>
store <8 x double> %0, <8 x double>* %agg.result, align 64
ret void
}
define void @test16elt(<16 x double>* noalias nocapture sret %agg.result, <16 x i16>* nocapture readonly) local_unnamed_addr #3 {
; CHECK-P8-LABEL: test16elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r6, r2, .LCPI3_2@toc@ha
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_0@toc@ha
; CHECK-P8-NEXT: lvx v4, 0, r4
; CHECK-P8-NEXT: xxlxor v3, v3, v3
; CHECK-P8-NEXT: addi r6, r6, .LCPI3_2@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_0@toc@l
; CHECK-P8-NEXT: lvx v5, 0, r6
; CHECK-P8-NEXT: li r6, 16
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_1@toc@ha
; CHECK-P8-NEXT: lvx v0, r4, r6
; CHECK-P8-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_1@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-P8-NEXT: lvx v1, 0, r5
; CHECK-P8-NEXT: li r5, 96
; CHECK-P8-NEXT: lvx v8, 0, r4
; CHECK-P8-NEXT: vperm v6, v3, v4, v2
; CHECK-P8-NEXT: li r4, 112
; CHECK-P8-NEXT: vperm v7, v3, v4, v5
; CHECK-P8-NEXT: vperm v2, v3, v0, v2
; CHECK-P8-NEXT: vperm v9, v3, v0, v1
; CHECK-P8-NEXT: vperm v5, v3, v0, v5
; CHECK-P8-NEXT: vperm v0, v3, v0, v8
; CHECK-P8-NEXT: vperm v1, v3, v4, v1
; CHECK-P8-NEXT: vperm v3, v3, v4, v8
; CHECK-P8-NEXT: xvcvuxddp vs1, v2
; CHECK-P8-NEXT: xvcvuxddp vs4, v9
; CHECK-P8-NEXT: xvcvuxddp vs2, v5
; CHECK-P8-NEXT: xvcvuxddp vs3, v0
; CHECK-P8-NEXT: xvcvuxddp vs0, v7
; CHECK-P8-NEXT: xvcvuxddp vs5, v3
; CHECK-P8-NEXT: xvcvuxddp vs6, v6
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xvcvuxddp vs7, v1
; CHECK-P8-NEXT: xxswapd vs4, vs4
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs5, vs5
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: stxvd2x vs2, r3, r5
; CHECK-P8-NEXT: li r4, 80
; CHECK-P8-NEXT: li r5, 64
; CHECK-P8-NEXT: xxswapd vs2, vs7
; CHECK-P8-NEXT: xxswapd vs3, vs6
; CHECK-P8-NEXT: stxvd2x vs4, r3, r4
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: stxvd2x vs5, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, r3, r5
; CHECK-P8-NEXT: stxvd2x vs2, r3, r6
; CHECK-P8-NEXT: stxvd2x vs3, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv v2, 16(r4)
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P9-NEXT: lxvx v4, 0, r4
; CHECK-P9-NEXT: xxlxor v5, v5, v5
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-P9-NEXT: vperm v0, v5, v3, v4
; CHECK-P9-NEXT: xvcvuxddp vs0, v0
; CHECK-P9-NEXT: lxvx v0, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_2@toc@l
; CHECK-P9-NEXT: vperm v1, v5, v3, v0
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxddp vs1, v1
; CHECK-P9-NEXT: lxvx v1, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-P9-NEXT: vperm v6, v5, v3, v1
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: xvcvuxddp vs2, v6
; CHECK-P9-NEXT: lxvx v6, 0, r4
; CHECK-P9-NEXT: vperm v3, v5, v3, v6
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: xvcvuxddp vs3, v3
; CHECK-P9-NEXT: vperm v3, v5, v2, v4
; CHECK-P9-NEXT: xvcvuxddp vs4, v3
; CHECK-P9-NEXT: vperm v3, v5, v2, v0
; CHECK-P9-NEXT: xvcvuxddp vs5, v3
; CHECK-P9-NEXT: vperm v3, v5, v2, v1
; CHECK-P9-NEXT: vperm v2, v5, v2, v6
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: xvcvuxddp vs6, v3
; CHECK-P9-NEXT: xvcvuxddp vs7, v2
; CHECK-P9-NEXT: stxv vs4, 64(r3)
; CHECK-P9-NEXT: stxv vs5, 80(r3)
; CHECK-P9-NEXT: stxv vs7, 112(r3)
; CHECK-P9-NEXT: stxv vs6, 96(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v2, 16(r4)
; CHECK-BE-NEXT: lxv v3, 0(r4)
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: xxlxor v5, v5, v5
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-BE-NEXT: vperm v0, v3, v5, v4
; CHECK-BE-NEXT: xvcvuxddp vs0, v0
; CHECK-BE-NEXT: lxvx v0, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_2@toc@l
; CHECK-BE-NEXT: vperm v1, v5, v3, v0
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxddp vs1, v1
; CHECK-BE-NEXT: lxvx v1, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-BE-NEXT: vperm v6, v5, v3, v1
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: xvcvuxddp vs2, v6
; CHECK-BE-NEXT: lxvx v6, 0, r4
; CHECK-BE-NEXT: vperm v3, v5, v3, v6
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: xvcvuxddp vs3, v3
; CHECK-BE-NEXT: vperm v3, v2, v5, v4
; CHECK-BE-NEXT: xvcvuxddp vs4, v3
; CHECK-BE-NEXT: vperm v3, v5, v2, v0
; CHECK-BE-NEXT: xvcvuxddp vs5, v3
; CHECK-BE-NEXT: vperm v3, v5, v2, v1
; CHECK-BE-NEXT: vperm v2, v5, v2, v6
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: xvcvuxddp vs6, v3
; CHECK-BE-NEXT: xvcvuxddp vs7, v2
; CHECK-BE-NEXT: stxv vs4, 64(r3)
; CHECK-BE-NEXT: stxv vs5, 80(r3)
; CHECK-BE-NEXT: stxv vs7, 112(r3)
; CHECK-BE-NEXT: stxv vs6, 96(r3)
; CHECK-BE-NEXT: blr
entry:
%a = load <16 x i16>, <16 x i16>* %0, align 32
%1 = uitofp <16 x i16> %a to <16 x double>
store <16 x double> %1, <16 x double>* %agg.result, align 128
ret void
}
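; The signed variants below use the same layout but must sign-extend the
; halfwords: CHECK-P9/CHECK-BE expect vextsh2d before xvcvsxddp, while
; CHECK-P8 expects the vsld/vsrad shift pair with the shift amount taken
; from the constant pool.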
define <2 x double> @test2elt_signed(i32 %a.coerce) local_unnamed_addr #0 {
; CHECK-P8-LABEL: test2elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI4_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: addi r3, r4, .LCPI4_0@toc@l
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: lvx v3, 0, r3
; CHECK-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha
; CHECK-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
; CHECK-P8-NEXT: vperm v2, v2, v2, v3
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: vsld v2, v2, v3
; CHECK-P8-NEXT: vsrad v2, v2, v3
; CHECK-P8-NEXT: xvcvsxddp v2, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test2elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp v2, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test2elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-BE-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-NEXT: vextsh2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp v2, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i32 %a.coerce to <2 x i16>
%1 = sitofp <2 x i16> %0 to <2 x double>
ret <2 x double> %1
}
define void @test4elt_signed(<4 x double>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test4elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI5_2@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI5_2@toc@l
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: lvx v4, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI5_1@toc@l
; CHECK-P8-NEXT: lxvd2x vs0, 0, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: vperm v2, v3, v3, v2
; CHECK-P8-NEXT: vperm v3, v3, v3, v4
; CHECK-P8-NEXT: xxswapd v4, vs0
; CHECK-P8-NEXT: vsld v2, v2, v4
; CHECK-P8-NEXT: vsld v3, v3, v4
; CHECK-P8-NEXT: vsrad v2, v2, v4
; CHECK-P8-NEXT: vsrad v3, v3, v4
; CHECK-P8-NEXT: xvcvsxddp vs0, v2
; CHECK-P8-NEXT: xvcvsxddp vs1, v3
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: stxvd2x vs1, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test4elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd f0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI5_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI5_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI5_1@toc@l
; CHECK-P9-NEXT: vextsh2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test4elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: vperm v3, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha
; CHECK-BE-NEXT: vextsh2d v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: vextsh2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp vs1, v2
; CHECK-BE-NEXT: stxv vs1, 0(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i64 %a.coerce to <4 x i16>
%1 = sitofp <4 x i16> %0 to <4 x double>
store <4 x double> %1, <4 x double>* %agg.result, align 32
ret void
}
define void @test8elt_signed(<8 x double>* noalias nocapture sret %agg.result, <8 x i16> %a) local_unnamed_addr #2 {
; CHECK-P8-LABEL: test8elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI6_2@toc@ha
; CHECK-P8-NEXT: addis r4, r2, .LCPI6_0@toc@ha
; CHECK-P8-NEXT: addis r6, r2, .LCPI6_3@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI6_2@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-P8-NEXT: addi r6, r6, .LCPI6_3@toc@l
; CHECK-P8-NEXT: lvx v4, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI6_4@toc@ha
; CHECK-P8-NEXT: lvx v3, 0, r4
; CHECK-P8-NEXT: lvx v5, 0, r6
; CHECK-P8-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI6_4@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-P8-NEXT: lvx v0, 0, r5
; CHECK-P8-NEXT: lxvd2x vs0, 0, r4
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: vperm v3, v2, v2, v3
; CHECK-P8-NEXT: vperm v4, v2, v2, v4
; CHECK-P8-NEXT: vperm v5, v2, v2, v5
; CHECK-P8-NEXT: vperm v2, v2, v2, v0
; CHECK-P8-NEXT: xxswapd v0, vs0
; CHECK-P8-NEXT: vsld v3, v3, v0
; CHECK-P8-NEXT: vsld v4, v4, v0
; CHECK-P8-NEXT: vsld v5, v5, v0
; CHECK-P8-NEXT: vsld v2, v2, v0
; CHECK-P8-NEXT: vsrad v3, v3, v0
; CHECK-P8-NEXT: vsrad v2, v2, v0
; CHECK-P8-NEXT: vsrad v4, v4, v0
; CHECK-P8-NEXT: vsrad v5, v5, v0
; CHECK-P8-NEXT: xvcvsxddp vs2, v2
; CHECK-P8-NEXT: xvcvsxddp vs0, v3
; CHECK-P8-NEXT: xvcvsxddp vs1, v5
; CHECK-P8-NEXT: xvcvsxddp vs3, v4
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: stxvd2x vs2, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: vextsh2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_2@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsh2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: vextsh2d v3, v3
; CHECK-P9-NEXT: xvcvsxddp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-BE-NEXT: vextsh2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_2@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: vextsh2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs1, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_3@toc@l
-; CHECK-BE-NEXT: vperm v3, v4, v2, v3
-; CHECK-BE-NEXT: stxv vs1, 32(r3)
+; CHECK-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-BE-NEXT: stxv vs1, 48(r3)
; CHECK-BE-NEXT: vextsh2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs2, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
-; CHECK-BE-NEXT: stxv vs2, 48(r3)
+; CHECK-BE-NEXT: stxv vs2, 0(r3)
; CHECK-BE-NEXT: vextsh2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp vs3, v2
-; CHECK-BE-NEXT: stxv vs3, 0(r3)
+; CHECK-BE-NEXT: stxv vs3, 32(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = sitofp <8 x i16> %a to <8 x double>
store <8 x double> %0, <8 x double>* %agg.result, align 64
ret void
}
define void @test16elt_signed(<16 x double>* noalias nocapture sret %agg.result, <16 x i16>* nocapture readonly) local_unnamed_addr #3 {
; CHECK-P8-LABEL: test16elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_0@toc@ha
; CHECK-P8-NEXT: addis r6, r2, .LCPI7_2@toc@ha
; CHECK-P8-NEXT: lvx v4, 0, r4
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_0@toc@l
; CHECK-P8-NEXT: addi r6, r6, .LCPI7_2@toc@l
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_3@toc@ha
; CHECK-P8-NEXT: lvx v3, 0, r6
; CHECK-P8-NEXT: addis r6, r2, .LCPI7_4@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_3@toc@l
; CHECK-P8-NEXT: addi r6, r6, .LCPI7_4@toc@l
; CHECK-P8-NEXT: lvx v5, 0, r5
; CHECK-P8-NEXT: lvx v0, 0, r6
; CHECK-P8-NEXT: li r6, 16
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_1@toc@ha
; CHECK-P8-NEXT: lvx v7, r4, r6
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_1@toc@l
; CHECK-P8-NEXT: vperm v1, v4, v4, v2
; CHECK-P8-NEXT: li r4, 112
; CHECK-P8-NEXT: vperm v6, v4, v4, v3
; CHECK-P8-NEXT: lxvd2x vs0, 0, r5
; CHECK-P8-NEXT: li r5, 96
; CHECK-P8-NEXT: vperm v8, v4, v4, v5
; CHECK-P8-NEXT: vperm v4, v4, v4, v0
; CHECK-P8-NEXT: vperm v5, v7, v7, v5
; CHECK-P8-NEXT: xxswapd v9, vs0
; CHECK-P8-NEXT: vperm v0, v7, v7, v0
; CHECK-P8-NEXT: vperm v2, v7, v7, v2
; CHECK-P8-NEXT: vperm v3, v7, v7, v3
; CHECK-P8-NEXT: vsld v1, v1, v9
; CHECK-P8-NEXT: vsld v6, v6, v9
; CHECK-P8-NEXT: vsld v5, v5, v9
; CHECK-P8-NEXT: vsld v0, v0, v9
; CHECK-P8-NEXT: vsld v2, v2, v9
; CHECK-P8-NEXT: vsld v3, v3, v9
; CHECK-P8-NEXT: vsrad v5, v5, v9
; CHECK-P8-NEXT: vsrad v0, v0, v9
; CHECK-P8-NEXT: vsld v7, v8, v9
; CHECK-P8-NEXT: vsld v4, v4, v9
; CHECK-P8-NEXT: vsrad v2, v2, v9
; CHECK-P8-NEXT: vsrad v3, v3, v9
; CHECK-P8-NEXT: xvcvsxddp vs2, v5
; CHECK-P8-NEXT: xvcvsxddp vs3, v0
; CHECK-P8-NEXT: vsrad v1, v1, v9
; CHECK-P8-NEXT: vsrad v6, v6, v9
; CHECK-P8-NEXT: vsrad v7, v7, v9
; CHECK-P8-NEXT: vsrad v4, v4, v9
; CHECK-P8-NEXT: xvcvsxddp vs1, v2
; CHECK-P8-NEXT: xxswapd vs2, vs2
; CHECK-P8-NEXT: xvcvsxddp vs4, v3
; CHECK-P8-NEXT: xxswapd vs3, vs3
; CHECK-P8-NEXT: xvcvsxddp vs0, v7
; CHECK-P8-NEXT: xvcvsxddp vs5, v4
; CHECK-P8-NEXT: xvcvsxddp vs6, v1
; CHECK-P8-NEXT: stxvd2x vs3, r3, r4
; CHECK-P8-NEXT: li r4, 80
; CHECK-P8-NEXT: xvcvsxddp vs7, v6
; CHECK-P8-NEXT: stxvd2x vs2, r3, r5
; CHECK-P8-NEXT: li r5, 64
; CHECK-P8-NEXT: xxswapd vs1, vs1
; CHECK-P8-NEXT: xxswapd vs4, vs4
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: xxswapd vs5, vs5
; CHECK-P8-NEXT: xxswapd vs3, vs6
; CHECK-P8-NEXT: stxvd2x vs4, r3, r4
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: xxswapd vs2, vs7
; CHECK-P8-NEXT: stxvd2x vs1, r3, r5
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: stxvd2x vs5, r3, r4
; CHECK-P8-NEXT: stxvd2x vs0, r3, r5
; CHECK-P8-NEXT: stxvd2x vs2, r3, r6
; CHECK-P8-NEXT: stxvd2x vs3, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_0@toc@ha
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l
; CHECK-P9-NEXT: lxv v2, 0(r4)
; CHECK-P9-NEXT: lxvx v3, 0, r5
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_1@toc@ha
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_1@toc@l
; CHECK-P9-NEXT: lxvx v5, 0, r5
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_2@toc@ha
; CHECK-P9-NEXT: vperm v4, v2, v2, v3
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_2@toc@l
; CHECK-P9-NEXT: vextsh2d v4, v4
; CHECK-P9-NEXT: lxvx v0, 0, r5
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_3@toc@ha
; CHECK-P9-NEXT: xvcvsxddp vs0, v4
; CHECK-P9-NEXT: vperm v4, v2, v2, v5
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_3@toc@l
; CHECK-P9-NEXT: lxvx v1, 0, r5
; CHECK-P9-NEXT: vextsh2d v4, v4
; CHECK-P9-NEXT: xvcvsxddp vs1, v4
; CHECK-P9-NEXT: vperm v4, v2, v2, v0
; CHECK-P9-NEXT: vperm v2, v2, v2, v1
; CHECK-P9-NEXT: vextsh2d v4, v4
; CHECK-P9-NEXT: xvcvsxddp vs2, v4
; CHECK-P9-NEXT: lxv v4, 16(r4)
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs3, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v3
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: xvcvsxddp vs4, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v5
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs5, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v0
; CHECK-P9-NEXT: stxv vs4, 64(r3)
; CHECK-P9-NEXT: stxv vs5, 80(r3)
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs6, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v1
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: stxv vs6, 96(r3)
; CHECK-P9-NEXT: xvcvsxddp vs7, v2
; CHECK-P9-NEXT: stxv vs7, 112(r3)
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r5, r2, .LCPI7_0@toc@ha
; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l
; CHECK-BE-NEXT: lxvx v2, 0, r5
-; CHECK-BE-NEXT: lxv v5, 0(r4)
-; CHECK-BE-NEXT: lxv v6, 16(r4)
+; CHECK-BE-NEXT: lxv v4, 0(r4)
+; CHECK-BE-NEXT: lxv v1, 16(r4)
; CHECK-BE-NEXT: addis r5, r2, .LCPI7_1@toc@ha
; CHECK-BE-NEXT: addi r5, r5, .LCPI7_1@toc@l
-; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha
-; CHECK-BE-NEXT: xxlxor v0, v0, v0
-; CHECK-BE-NEXT: vperm v1, v0, v5, v2
+; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha
+; CHECK-BE-NEXT: xxlxor v5, v5, v5
+; CHECK-BE-NEXT: vperm v0, v5, v4, v2
; CHECK-BE-NEXT: lxvx v3, 0, r5
-; CHECK-BE-NEXT: vperm v2, v0, v6, v2
-; CHECK-BE-NEXT: addis r5, r2, .LCPI7_2@toc@ha
-; CHECK-BE-NEXT: addi r5, r5, .LCPI7_2@toc@l
-; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l
-; CHECK-BE-NEXT: vextsh2d v2, v2
-; CHECK-BE-NEXT: lxvx v4, 0, r5
-; CHECK-BE-NEXT: vextsh2d v1, v1
-; CHECK-BE-NEXT: xvcvsxddp vs3, v2
-; CHECK-BE-NEXT: vperm v2, v0, v6, v3
-; CHECK-BE-NEXT: xvcvsxddp vs0, v1
-; CHECK-BE-NEXT: vperm v1, v0, v5, v3
+; CHECK-BE-NEXT: vperm v2, v5, v1, v2
; CHECK-BE-NEXT: vextsh2d v2, v2
-; CHECK-BE-NEXT: xvcvsxddp vs4, v2
-; CHECK-BE-NEXT: vperm v2, v0, v6, v4
-; CHECK-BE-NEXT: vextsh2d v1, v1
-; CHECK-BE-NEXT: xvcvsxddp vs1, v1
-; CHECK-BE-NEXT: vperm v1, v0, v5, v4
-; CHECK-BE-NEXT: stxv vs3, 80(r3)
+; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l
+; CHECK-BE-NEXT: vextsh2d v0, v0
+; CHECK-BE-NEXT: xvcvsxddp vs2, v2
+; CHECK-BE-NEXT: vperm v2, v5, v1, v3
; CHECK-BE-NEXT: vextsh2d v2, v2
-; CHECK-BE-NEXT: xvcvsxddp vs5, v2
+; CHECK-BE-NEXT: stxv vs2, 80(r3)
+; CHECK-BE-NEXT: xvcvsxddp vs3, v2
; CHECK-BE-NEXT: lxvx v2, 0, r4
-; CHECK-BE-NEXT: vperm v3, v5, v5, v2
-; CHECK-BE-NEXT: vperm v2, v6, v6, v2
-; CHECK-BE-NEXT: vextsh2d v1, v1
-; CHECK-BE-NEXT: stxv vs4, 96(r3)
+; CHECK-BE-NEXT: xvcvsxddp vs0, v0
+; CHECK-BE-NEXT: vperm v0, v5, v4, v3
+; CHECK-BE-NEXT: vperm v3, v4, v4, v2
+; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha
+; CHECK-BE-NEXT: vextsh2d v0, v0
+; CHECK-BE-NEXT: xvcvsxddp vs1, v0
+; CHECK-BE-NEXT: stxv vs1, 48(r3)
; CHECK-BE-NEXT: vextsh2d v3, v3
+; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l
+; CHECK-BE-NEXT: xvcvsxddp vs4, v3
+; CHECK-BE-NEXT: lxvx v3, 0, r4
+; CHECK-BE-NEXT: vperm v2, v1, v1, v2
+; CHECK-BE-NEXT: vextsh2d v2, v2
+; CHECK-BE-NEXT: xvcvsxddp vs6, v2
+; CHECK-BE-NEXT: vperm v2, v1, v1, v3
+; CHECK-BE-NEXT: vperm v4, v4, v4, v3
+; CHECK-BE-NEXT: vextsh2d v4, v4
; CHECK-BE-NEXT: vextsh2d v2, v2
-; CHECK-BE-NEXT: xvcvsxddp vs2, v1
-; CHECK-BE-NEXT: stxv vs2, 48(r3)
-; CHECK-BE-NEXT: stxv vs5, 112(r3)
-; CHECK-BE-NEXT: xvcvsxddp vs6, v3
; CHECK-BE-NEXT: xvcvsxddp vs7, v2
-; CHECK-BE-NEXT: stxv vs7, 64(r3)
-; CHECK-BE-NEXT: stxv vs1, 32(r3)
+; CHECK-BE-NEXT: xvcvsxddp vs5, v4
+; CHECK-BE-NEXT: stxv vs3, 112(r3)
+; CHECK-BE-NEXT: stxv vs6, 64(r3)
; CHECK-BE-NEXT: stxv vs0, 16(r3)
-; CHECK-BE-NEXT: stxv vs6, 0(r3)
+; CHECK-BE-NEXT: stxv vs4, 0(r3)
+; CHECK-BE-NEXT: stxv vs7, 96(r3)
+; CHECK-BE-NEXT: stxv vs5, 32(r3)
; CHECK-BE-NEXT: blr
entry:
%a = load <16 x i16>, <16 x i16>* %0, align 32
%1 = sitofp <16 x i16> %a to <16 x double>
store <16 x double> %1, <16 x double>* %agg.result, align 128
ret void
}
Index: llvm/trunk/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll (revision 357116)
@@ -1,573 +1,573 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-P8
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-P9
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
; RUN: FileCheck %s --check-prefix=CHECK-BE
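; These tests convert <N x i8> to <N x float>. For the two-element case
; the CHECK lines expect scalar code: the bytes are moved to GPRs
; (mfvsrd / vextubrx), masked with rlwinm, converted with xscvuxdsp and
; repacked with vmrglw / vmrghw. The four-element and larger cases expect
; a vperm against a zeroed register followed by xvcvuxwsp.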
define i64 @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
; CHECK-P8-LABEL: test2elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: mfvsrd r3, f0
; CHECK-P8-NEXT: clrldi r4, r3, 56
; CHECK-P8-NEXT: rldicl r3, r3, 56, 56
; CHECK-P8-NEXT: rlwinm r4, r4, 0, 24, 31
; CHECK-P8-NEXT: rlwinm r3, r3, 0, 24, 31
; CHECK-P8-NEXT: mtvsrwz f0, r4
; CHECK-P8-NEXT: mtvsrwz f1, r3
; CHECK-P8-NEXT: xscvuxdsp f0, f0
; CHECK-P8-NEXT: xscvuxdsp f1, f1
; CHECK-P8-NEXT: xscvdpspn vs0, f0
; CHECK-P8-NEXT: xscvdpspn vs1, f1
; CHECK-P8-NEXT: xxsldwi v2, vs0, vs0, 1
; CHECK-P8-NEXT: xxsldwi v3, vs1, vs1, 1
; CHECK-P8-NEXT: vmrglw v2, v3, v2
; CHECK-P8-NEXT: xxswapd vs0, v2
; CHECK-P8-NEXT: mfvsrd r3, f0
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test2elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: li r3, 0
; CHECK-P9-NEXT: vextubrx r3, r3, v2
; CHECK-P9-NEXT: rlwinm r3, r3, 0, 24, 31
; CHECK-P9-NEXT: mtvsrwz f0, r3
; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: xscvuxdsp f0, f0
; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: vextubrx r3, r3, v2
; CHECK-P9-NEXT: rlwinm r3, r3, 0, 24, 31
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 1
; CHECK-P9-NEXT: mtvsrwz f0, r3
; CHECK-P9-NEXT: xscvuxdsp f0, f0
; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 1
; CHECK-P9-NEXT: vmrglw v2, v2, v3
; CHECK-P9-NEXT: mfvsrld r3, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test2elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: li r3, 1
; CHECK-BE-NEXT: vextublx r3, r3, v2
; CHECK-BE-NEXT: rlwinm r3, r3, 0, 24, 31
; CHECK-BE-NEXT: mtvsrwz f0, r3
; CHECK-BE-NEXT: li r3, 0
; CHECK-BE-NEXT: xscvuxdsp f0, f0
; CHECK-BE-NEXT: vextublx r3, r3, v2
; CHECK-BE-NEXT: rlwinm r3, r3, 0, 24, 31
; CHECK-BE-NEXT: xscvdpspn v3, f0
; CHECK-BE-NEXT: mtvsrwz f0, r3
; CHECK-BE-NEXT: xscvuxdsp f0, f0
; CHECK-BE-NEXT: xscvdpspn v2, f0
; CHECK-BE-NEXT: vmrghw v2, v2, v3
; CHECK-BE-NEXT: mfvsrd r3, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i16 %a.coerce to <2 x i8>
%1 = uitofp <2 x i8> %0 to <2 x float>
%2 = bitcast <2 x float> %1 to i64
ret i64 %2
}
define <4 x float> @test4elt(i32 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test4elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: addi r3, r4, .LCPI1_0@toc@l
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: lvx v3, 0, r3
; CHECK-P8-NEXT: vperm v2, v4, v2, v3
; CHECK-P8-NEXT: xvcvuxwsp v2, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test4elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxwsp v2, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test4elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha
; CHECK-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxwsp v2, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i32 %a.coerce to <4 x i8>
%1 = uitofp <4 x i8> %0 to <4 x float>
ret <4 x float> %1
}
define void @test8elt(<8 x float>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #2 {
; CHECK-P8-LABEL: test8elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: lvx v5, 0, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: vperm v2, v4, v3, v2
; CHECK-P8-NEXT: vperm v3, v4, v3, v5
; CHECK-P8-NEXT: xvcvuxwsp v2, v2
; CHECK-P8-NEXT: xvcvuxwsp v3, v3
; CHECK-P8-NEXT: stvx v2, 0, r3
; CHECK-P8-NEXT: stvx v3, r3, r4
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd f0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxwsp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxwsp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxwsp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxwsp vs1, v2
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i64 %a.coerce to <8 x i8>
%1 = uitofp <8 x i8> %0 to <8 x float>
store <8 x float> %1, <8 x float>* %agg.result, align 32
ret void
}
define void @test16elt(<16 x float>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #3 {
; CHECK-P8-LABEL: test16elt:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_2@toc@ha
; CHECK-P8-NEXT: xxlxor v4, v4, v4
; CHECK-P8-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_2@toc@l
; CHECK-P8-NEXT: lvx v3, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-P8-NEXT: lvx v5, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI3_1@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI3_1@toc@l
; CHECK-P8-NEXT: lvx v0, 0, r4
; CHECK-P8-NEXT: lvx v1, 0, r5
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: vperm v5, v4, v2, v5
; CHECK-P8-NEXT: vperm v3, v4, v2, v3
; CHECK-P8-NEXT: vperm v0, v4, v2, v0
; CHECK-P8-NEXT: vperm v2, v4, v2, v1
; CHECK-P8-NEXT: xvcvuxwsp v4, v5
; CHECK-P8-NEXT: xvcvuxwsp v3, v3
; CHECK-P8-NEXT: xvcvuxwsp v5, v0
; CHECK-P8-NEXT: xvcvuxwsp v2, v2
; CHECK-P8-NEXT: stvx v4, r3, r5
; CHECK-P8-NEXT: stvx v3, 0, r3
; CHECK-P8-NEXT: stvx v5, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stvx v2, r3, r4
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxwsp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_2@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: xvcvuxwsp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: xvcvuxwsp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: xvcvuxwsp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxwsp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_2@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: xvcvuxwsp vs1, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_3@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: xvcvuxwsp vs2, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v4, v2, v3
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: xvcvuxwsp vs3, v2
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = uitofp <16 x i8> %a to <16 x float>
store <16 x float> %0, <16 x float>* %agg.result, align 64
ret void
}
define i64 @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
; CHECK-P8-LABEL: test2elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: mfvsrd r3, f0
; CHECK-P8-NEXT: clrldi r4, r3, 56
; CHECK-P8-NEXT: rldicl r3, r3, 56, 56
; CHECK-P8-NEXT: extsb r4, r4
; CHECK-P8-NEXT: extsb r3, r3
; CHECK-P8-NEXT: mtvsrwa f0, r4
; CHECK-P8-NEXT: mtvsrwa f1, r3
; CHECK-P8-NEXT: xscvsxdsp f0, f0
; CHECK-P8-NEXT: xscvsxdsp f1, f1
; CHECK-P8-NEXT: xscvdpspn vs0, f0
; CHECK-P8-NEXT: xscvdpspn vs1, f1
; CHECK-P8-NEXT: xxsldwi v2, vs0, vs0, 1
; CHECK-P8-NEXT: xxsldwi v3, vs1, vs1, 1
; CHECK-P8-NEXT: vmrglw v2, v3, v2
; CHECK-P8-NEXT: xxswapd vs0, v2
; CHECK-P8-NEXT: mfvsrd r3, f0
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test2elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: li r3, 0
; CHECK-P9-NEXT: vextubrx r3, r3, v2
; CHECK-P9-NEXT: extsb r3, r3
; CHECK-P9-NEXT: mtvsrwa f0, r3
; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: xscvsxdsp f0, f0
; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: vextubrx r3, r3, v2
; CHECK-P9-NEXT: extsb r3, r3
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 1
; CHECK-P9-NEXT: mtvsrwa f0, r3
; CHECK-P9-NEXT: xscvsxdsp f0, f0
; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 1
; CHECK-P9-NEXT: vmrglw v2, v2, v3
; CHECK-P9-NEXT: mfvsrld r3, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test2elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: li r3, 1
; CHECK-BE-NEXT: vextublx r3, r3, v2
; CHECK-BE-NEXT: extsb r3, r3
; CHECK-BE-NEXT: mtvsrwa f0, r3
; CHECK-BE-NEXT: li r3, 0
; CHECK-BE-NEXT: xscvsxdsp f0, f0
; CHECK-BE-NEXT: vextublx r3, r3, v2
; CHECK-BE-NEXT: extsb r3, r3
; CHECK-BE-NEXT: xscvdpspn v3, f0
; CHECK-BE-NEXT: mtvsrwa f0, r3
; CHECK-BE-NEXT: xscvsxdsp f0, f0
; CHECK-BE-NEXT: xscvdpspn v2, f0
; CHECK-BE-NEXT: vmrghw v2, v2, v3
; CHECK-BE-NEXT: mfvsrd r3, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i16 %a.coerce to <2 x i8>
%1 = sitofp <2 x i8> %0 to <2 x float>
%2 = bitcast <2 x float> %1 to i64
ret i64 %2
}
define <4 x float> @test4elt_signed(i32 %a.coerce) local_unnamed_addr #1 {
; CHECK-P8-LABEL: test4elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r3
; CHECK-P8-NEXT: addi r3, r4, .LCPI5_0@toc@l
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: lvx v3, 0, r3
; CHECK-P8-NEXT: vperm v2, v2, v2, v3
; CHECK-P8-NEXT: vspltisw v3, 12
; CHECK-P8-NEXT: vadduwm v3, v3, v3
; CHECK-P8-NEXT: vslw v2, v2, v3
; CHECK-P8-NEXT: vsraw v2, v2, v3
; CHECK-P8-NEXT: xvcvsxwsp v2, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test4elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: vextsb2w v2, v2
; CHECK-P9-NEXT: xvcvsxwsp v2, v2
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test4elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha
; CHECK-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-NEXT: vextsb2w v2, v2
; CHECK-BE-NEXT: xvcvsxwsp v2, v2
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i32 %a.coerce to <4 x i8>
%1 = sitofp <4 x i8> %0 to <4 x float>
ret <4 x float> %1
}
define void @test8elt_signed(<8 x float>* noalias nocapture sret %agg.result, i64 %a.coerce) local_unnamed_addr #2 {
; CHECK-P8-LABEL: test8elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r5, r2, .LCPI6_0@toc@ha
; CHECK-P8-NEXT: mtvsrd f0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P8-NEXT: vspltisw v5, 12
; CHECK-P8-NEXT: addi r5, r5, .LCPI6_0@toc@l
; CHECK-P8-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-P8-NEXT: lvx v2, 0, r5
; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: lvx v4, 0, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: vperm v2, v3, v3, v2
; CHECK-P8-NEXT: vperm v3, v3, v3, v4
; CHECK-P8-NEXT: vadduwm v4, v5, v5
; CHECK-P8-NEXT: vslw v2, v2, v4
; CHECK-P8-NEXT: vslw v3, v3, v4
; CHECK-P8-NEXT: vsraw v2, v2, v4
; CHECK-P8-NEXT: vsraw v3, v3, v4
; CHECK-P8-NEXT: xvcvsxwsp v2, v2
; CHECK-P8-NEXT: xvcvsxwsp v3, v3
; CHECK-P8-NEXT: stvx v2, 0, r3
; CHECK-P8-NEXT: stvx v3, r3, r4
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd f0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-P9-NEXT: vextsb2w v3, v3
; CHECK-P9-NEXT: xvcvsxwsp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsb2w v2, v2
; CHECK-P9-NEXT: xvcvsxwsp vs1, v2
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: vperm v3, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-BE-NEXT: vextsb2w v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l
; CHECK-BE-NEXT: xvcvsxwsp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: vextsb2w v2, v2
; CHECK-BE-NEXT: xvcvsxwsp vs1, v2
; CHECK-BE-NEXT: stxv vs1, 0(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = bitcast i64 %a.coerce to <8 x i8>
%1 = sitofp <8 x i8> %0 to <8 x float>
store <8 x float> %1, <8 x float>* %agg.result, align 32
ret void
}
define void @test16elt_signed(<16 x float>* noalias nocapture sret %agg.result, <16 x i8> %a) local_unnamed_addr #3 {
; CHECK-P8-LABEL: test16elt_signed:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: addis r4, r2, .LCPI7_0@toc@ha
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_2@toc@ha
; CHECK-P8-NEXT: vspltisw v1, 12
; CHECK-P8-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_2@toc@l
; CHECK-P8-NEXT: lvx v3, 0, r4
; CHECK-P8-NEXT: addis r4, r2, .LCPI7_3@toc@ha
; CHECK-P8-NEXT: lvx v4, 0, r5
; CHECK-P8-NEXT: addis r5, r2, .LCPI7_1@toc@ha
; CHECK-P8-NEXT: addi r4, r4, .LCPI7_3@toc@l
; CHECK-P8-NEXT: addi r5, r5, .LCPI7_1@toc@l
; CHECK-P8-NEXT: lvx v5, 0, r4
; CHECK-P8-NEXT: lvx v0, 0, r5
; CHECK-P8-NEXT: li r4, 48
; CHECK-P8-NEXT: li r5, 32
; CHECK-P8-NEXT: vperm v3, v2, v2, v3
; CHECK-P8-NEXT: vperm v4, v2, v2, v4
; CHECK-P8-NEXT: vperm v5, v2, v2, v5
; CHECK-P8-NEXT: vperm v2, v2, v2, v0
; CHECK-P8-NEXT: vadduwm v0, v1, v1
; CHECK-P8-NEXT: vslw v3, v3, v0
; CHECK-P8-NEXT: vslw v4, v4, v0
; CHECK-P8-NEXT: vslw v5, v5, v0
; CHECK-P8-NEXT: vslw v2, v2, v0
; CHECK-P8-NEXT: vsraw v3, v3, v0
; CHECK-P8-NEXT: vsraw v4, v4, v0
; CHECK-P8-NEXT: vsraw v5, v5, v0
; CHECK-P8-NEXT: vsraw v2, v2, v0
; CHECK-P8-NEXT: xvcvsxwsp v3, v3
; CHECK-P8-NEXT: xvcvsxwsp v4, v4
; CHECK-P8-NEXT: xvcvsxwsp v5, v5
; CHECK-P8-NEXT: xvcvsxwsp v2, v2
; CHECK-P8-NEXT: stvx v3, 0, r3
; CHECK-P8-NEXT: stvx v4, r3, r5
; CHECK-P8-NEXT: stvx v5, r3, r4
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: stvx v2, r3, r4
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_0@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_1@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: vextsb2w v3, v3
; CHECK-P9-NEXT: xvcvsxwsp vs0, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_2@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_2@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsb2w v3, v3
; CHECK-P9-NEXT: xvcvsxwsp vs1, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI7_3@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI7_3@toc@l
; CHECK-P9-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: vextsb2w v3, v3
; CHECK-P9-NEXT: xvcvsxwsp vs2, v3
; CHECK-P9-NEXT: lxvx v3, 0, r4
; CHECK-P9-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: vextsb2w v2, v2
; CHECK-P9-NEXT: xvcvsxwsp vs3, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l
; CHECK-BE-NEXT: vextsb2w v3, v3
; CHECK-BE-NEXT: xvcvsxwsp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: vextsb2w v3, v3
; CHECK-BE-NEXT: xvcvsxwsp vs1, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l
-; CHECK-BE-NEXT: vperm v3, v4, v2, v3
-; CHECK-BE-NEXT: stxv vs1, 32(r3)
+; CHECK-BE-NEXT: vperm v3, v2, v2, v3
+; CHECK-BE-NEXT: stxv vs1, 48(r3)
; CHECK-BE-NEXT: vextsb2w v3, v3
; CHECK-BE-NEXT: xvcvsxwsp vs2, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
-; CHECK-BE-NEXT: stxv vs2, 48(r3)
+; CHECK-BE-NEXT: stxv vs2, 0(r3)
; CHECK-BE-NEXT: vextsb2w v2, v2
; CHECK-BE-NEXT: xvcvsxwsp vs3, v2
-; CHECK-BE-NEXT: stxv vs3, 0(r3)
+; CHECK-BE-NEXT: stxv vs3, 32(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = sitofp <16 x i8> %a to <16 x float>
store <16 x float> %0, <16 x float>* %agg.result, align 64
ret void
}
Index: llvm/trunk/test/CodeGen/X86/vec_minmax_sint.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_minmax_sint.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/X86/vec_minmax_sint.ll (revision 357116)
@@ -1,2036 +1,2050 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Signed Maximum (GT)
;
define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: max_gt_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sgt <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
}
define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: max_gt_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v4i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
; SSE41-NEXT: movdqa %xmm0, %xmm7
; SSE41-NEXT: pxor %xmm5, %xmm7
; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm6
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v4i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE42-NEXT: movapd %xmm2, %xmm0
; SSE42-NEXT: movapd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
%1 = icmp sgt <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
ret <4 x i64> %2
}
define <4 x i32> @max_gt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: max_gt_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v4i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_gt_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <4 x i32> %a, %b
%2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %2
}
define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_gt_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
; SSE41-NEXT: pmaxsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v8i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm2, %xmm0
; SSE42-NEXT: pmaxsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sgt <8 x i32> %a, %b
%2 = select <8 x i1> %1, <8 x i32> %a, <8 x i32> %b
ret <8 x i32> %2
}
define <8 x i16> @max_gt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: max_gt_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <8 x i16> %a, %b
%2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %2
}
define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: max_gt_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: max_gt_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sgt <16 x i16> %a, %b
%2 = select <16 x i1> %1, <16 x i16> %a, <16 x i16> %b
ret <16 x i16> %2
}
define <16 x i8> @max_gt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: max_gt_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v16i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_gt_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <16 x i8> %a, %b
%2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b
ret <16 x i8> %2
}
define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: max_gt_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm2, %xmm0
; SSE41-NEXT: pmaxsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v32i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm2, %xmm0
; SSE42-NEXT: pmaxsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sgt <32 x i8> %a, %b
%2 = select <32 x i1> %1, <32 x i8> %a, <32 x i8> %b
ret <32 x i8> %2
}
;
; Signed Maximum (GE)
;
define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: max_ge_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sge <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
}
define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: max_ge_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v4i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
; SSE41-NEXT: movdqa %xmm0, %xmm7
; SSE41-NEXT: pxor %xmm5, %xmm7
; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm6
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v4i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE42-NEXT: movapd %xmm2, %xmm0
; SSE42-NEXT: movapd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
%1 = icmp sge <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
ret <4 x i64> %2
}
define <4 x i32> @max_ge_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: max_ge_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v4i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_ge_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sge <4 x i32> %a, %b
%2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %2
}
define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_ge_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
; SSE41-NEXT: pmaxsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v8i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm2, %xmm0
; SSE42-NEXT: pmaxsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sge <8 x i32> %a, %b
%2 = select <8 x i1> %1, <8 x i32> %a, <8 x i32> %b
ret <8 x i32> %2
}
define <8 x i16> @max_ge_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: max_ge_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sge <8 x i16> %a, %b
%2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %2
}
define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: max_ge_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: max_ge_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sge <16 x i16> %a, %b
%2 = select <16 x i1> %1, <16 x i16> %a, <16 x i16> %b
ret <16 x i16> %2
}
define <16 x i8> @max_ge_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: max_ge_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v16i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_ge_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sge <16 x i8> %a, %b
%2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b
ret <16 x i8> %2
}
define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: max_ge_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm2, %xmm0
; SSE41-NEXT: pmaxsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v32i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm2, %xmm0
; SSE42-NEXT: pmaxsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sge <32 x i8> %a, %b
%2 = select <32 x i1> %1, <32 x i8> %a, <32 x i8> %b
ret <32 x i8> %2
}
;
; Signed Minimum (LT)
;
define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: min_lt_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
}
define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: min_lt_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v4i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v4i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE42-NEXT: movdqa %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE42-NEXT: movapd %xmm2, %xmm0
; SSE42-NEXT: movapd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
%1 = icmp slt <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
ret <4 x i64> %2
}
define <4 x i32> @min_lt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: min_lt_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v4i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_lt_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp slt <4 x i32> %a, %b
%2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %2
}
define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: min_lt_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm2, %xmm0
; SSE41-NEXT: pminsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v8i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm2, %xmm0
; SSE42-NEXT: pminsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <8 x i32> %a, %b
%2 = select <8 x i1> %1, <8 x i32> %a, <8 x i32> %b
ret <8 x i32> %2
}
define <8 x i16> @min_lt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: min_lt_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp slt <8 x i16> %a, %b
%2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %2
}
define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: min_lt_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: min_lt_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <16 x i16> %a, %b
%2 = select <16 x i1> %1, <16 x i16> %a, <16 x i16> %b
ret <16 x i16> %2
}
define <16 x i8> @min_lt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: min_lt_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v16i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_lt_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp slt <16 x i8> %a, %b
%2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b
ret <16 x i8> %2
}
define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: min_lt_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm2, %xmm0
; SSE41-NEXT: pminsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v32i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm2, %xmm0
; SSE42-NEXT: pminsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <32 x i8> %a, %b
%2 = select <32 x i1> %1, <32 x i8> %a, <32 x i8> %b
ret <32 x i8> %2
}
;
; Signed Minimum (LE)
;
define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: min_le_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sle <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
}
define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: min_le_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v4i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v4i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm4, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE42-NEXT: movdqa %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE42-NEXT: movapd %xmm2, %xmm0
; SSE42-NEXT: movapd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
%1 = icmp sle <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
ret <4 x i64> %2
}
define <4 x i32> @min_le_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: min_le_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v4i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_le_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sle <4 x i32> %a, %b
%2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %2
}
define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: min_le_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm2, %xmm0
; SSE41-NEXT: pminsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v8i32:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm2, %xmm0
; SSE42-NEXT: pminsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sle <8 x i32> %a, %b
%2 = select <8 x i1> %1, <8 x i32> %a, <8 x i32> %b
ret <8 x i32> %2
}
define <8 x i16> @min_le_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: min_le_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sle <8 x i16> %a, %b
%2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %2
}
define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: min_le_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: min_le_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sle <16 x i16> %a, %b
%2 = select <16 x i1> %1, <16 x i16> %a, <16 x i16> %b
ret <16 x i16> %2
}
define <16 x i8> @min_le_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: min_le_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v16i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_le_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sle <16 x i8> %a, %b
%2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b
ret <16 x i8> %2
}
define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: min_le_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm2, %xmm0
; SSE41-NEXT: pminsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v32i8:
; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm2, %xmm0
; SSE42-NEXT: pminsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sle <32 x i8> %a, %b
%2 = select <32 x i1> %1, <32 x i8> %a, <32 x i8> %b
ret <32 x i8> %2
}
;
; Constant Folding
;
define <2 x i64> @max_gt_v2i64c() {
; SSE-LABEL: max_gt_v2i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v2i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
%2 = insertelement <2 x i64> <i64 -1, i64 1>, i64 -1, i32 0
%3 = icmp sgt <2 x i64> %1, %2
%4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
ret <2 x i64> %4
}
define <4 x i64> @max_gt_v4i64c() {
; SSE-LABEL: max_gt_v4i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,7]
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v4i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
%2 = insertelement <4 x i64> <i64 -1, i64 -7, i64 7, i64 1>, i64 -1, i32 0
%3 = icmp sgt <4 x i64> %1, %2
%4 = select <4 x i1> %3, <4 x i64> %1, <4 x i64> %2
ret <4 x i64> %4
}
define <4 x i32> @max_gt_v4i32c() {
; SSE-LABEL: max_gt_v4i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v4i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
%2 = insertelement <4 x i32> <i32 -1, i32 -7, i32 7, i32 1>, i32 -1, i32 0
%3 = icmp sgt <4 x i32> %1, %2
%4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
ret <4 x i32> %4
}
define <8 x i32> @max_gt_v8i32c() {
; SSE-LABEL: max_gt_v8i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967293,4294967293,4294967295]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
%2 = insertelement <8 x i32> <i32 -1, i32 -3, i32 -5, i32 -7, i32 7, i32 5, i32 3, i32 1>, i32 -1, i32 0
%3 = icmp sgt <8 x i32> %1, %2
%4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
ret <8 x i32> %4
}
define <8 x i16> @max_gt_v8i16c() {
; SSE-LABEL: max_gt_v8i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
%2 = insertelement <8 x i16> <i16 -1, i16 -3, i16 -5, i16 -7, i16 7, i16 5, i16 3, i16 1>, i16 -1, i32 0
%3 = icmp sgt <8 x i16> %1, %2
%4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
ret <8 x i16> %4
}
define <16 x i16> @max_gt_v16i16c() {
; SSE-LABEL: max_gt_v16i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65534,65533,65532,65533,65534,65535,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v16i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
%2 = insertelement <16 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, i16 -1, i32 0
%3 = icmp sgt <16 x i16> %1, %2
%4 = select <16 x i1> %3, <16 x i16> %1, <16 x i16> %2
ret <16 x i16> %4
}
define <16 x i8> @max_gt_v16i8c() {
; SSE-LABEL: max_gt_v16i8c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v16i8c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
%2 = insertelement <16 x i8> <i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, i8 -1, i32 0
%3 = icmp sgt <16 x i8> %1, %2
%4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
ret <16 x i8> %4
}
define <2 x i64> @max_ge_v2i64c() {
; SSE-LABEL: max_ge_v2i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v2i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
%2 = insertelement <2 x i64> <i64 -1, i64 1>, i64 -1, i32 0
%3 = icmp sge <2 x i64> %1, %2
%4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
ret <2 x i64> %4
}
define <4 x i64> @max_ge_v4i64c() {
; SSE-LABEL: max_ge_v4i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,7]
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v4i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
%2 = insertelement <4 x i64> <i64 -1, i64 -7, i64 7, i64 1>, i64 -1, i32 0
%3 = icmp sge <4 x i64> %1, %2
%4 = select <4 x i1> %3, <4 x i64> %1, <4 x i64> %2
ret <4 x i64> %4
}
define <4 x i32> @max_ge_v4i32c() {
; SSE-LABEL: max_ge_v4i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v4i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
%2 = insertelement <4 x i32> <i32 -1, i32 -7, i32 7, i32 1>, i32 -1, i32 0
%3 = icmp sge <4 x i32> %1, %2
%4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
ret <4 x i32> %4
}
define <8 x i32> @max_ge_v8i32c() {
; SSE-LABEL: max_ge_v8i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967293,4294967293,4294967295]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
%2 = insertelement <8 x i32> <i32 -1, i32 -3, i32 -5, i32 -7, i32 7, i32 5, i32 3, i32 1>, i32 -1, i32 0
%3 = icmp sge <8 x i32> %1, %2
%4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
ret <8 x i32> %4
}
define <8 x i16> @max_ge_v8i16c() {
; SSE-LABEL: max_ge_v8i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
%2 = insertelement <8 x i16> <i16 -1, i16 -3, i16 -5, i16 -7, i16 7, i16 5, i16 3, i16 1>, i16 -1, i32 0
%3 = icmp sge <8 x i16> %1, %2
%4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
ret <8 x i16> %4
}
define <16 x i16> @max_ge_v16i16c() {
; SSE-LABEL: max_ge_v16i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65534,65533,65532,65533,65534,65535,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v16i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
%2 = insertelement <16 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, i16 -1, i32 0
%3 = icmp sge <16 x i16> %1, %2
%4 = select <16 x i1> %3, <16 x i16> %1, <16 x i16> %2
ret <16 x i16> %4
}
define <16 x i8> @max_ge_v16i8c() {
; SSE-LABEL: max_ge_v16i8c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v16i8c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
%2 = insertelement <16 x i8> <i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, i8 -1, i32 0
%3 = icmp sge <16 x i8> %1, %2
%4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
ret <16 x i8> %4
}
define <2 x i64> @min_lt_v2i64c() {
; SSE-LABEL: min_lt_v2i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v2i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
%2 = insertelement <2 x i64> <i64 -1, i64 1>, i64 -1, i32 0
%3 = icmp slt <2 x i64> %1, %2
%4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
ret <2 x i64> %4
}
define <4 x i64> @min_lt_v4i64c() {
; SSE-LABEL: min_lt_v4i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,18446744073709551609]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v4i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
%2 = insertelement <4 x i64> <i64 -1, i64 -7, i64 7, i64 1>, i64 -1, i32 0
%3 = icmp slt <4 x i64> %1, %2
%4 = select <4 x i1> %3, <4 x i64> %1, <4 x i64> %2
ret <4 x i64> %4
}
define <4 x i32> @min_lt_v4i32c() {
; SSE-LABEL: min_lt_v4i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v4i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
%2 = insertelement <4 x i32> <i32 -1, i32 -7, i32 7, i32 1>, i32 -1, i32 0
%3 = icmp slt <4 x i32> %1, %2
%4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
ret <4 x i32> %4
}
define <8 x i32> @min_lt_v8i32c() {
; SSE-LABEL: min_lt_v8i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967291,4294967291,4294967289]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
%2 = insertelement <8 x i32> <i32 -1, i32 -3, i32 -5, i32 -7, i32 7, i32 5, i32 3, i32 1>, i32 -1, i32 0
%3 = icmp slt <8 x i32> %1, %2
%4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
ret <8 x i32> %4
}
define <8 x i16> @min_lt_v8i16c() {
; SSE-LABEL: min_lt_v8i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
%2 = insertelement <8 x i16> <i16 -1, i16 -3, i16 -5, i16 -7, i16 7, i16 5, i16 3, i16 1>, i16 -1, i32 0
%3 = icmp slt <8 x i16> %1, %2
%4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
ret <8 x i16> %4
}
define <16 x i16> @min_lt_v16i16c() {
; SSE-LABEL: min_lt_v16i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65530,65531,65532,65531,65530,65529,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v16i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
%2 = insertelement <16 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, i16 -1, i32 0
%3 = icmp slt <16 x i16> %1, %2
%4 = select <16 x i1> %3, <16 x i16> %1, <16 x i16> %2
ret <16 x i16> %4
}
define <16 x i8> @min_lt_v16i8c() {
; SSE-LABEL: min_lt_v16i8c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v16i8c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
%2 = insertelement <16 x i8> <i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, i8 -1, i32 0
%3 = icmp slt <16 x i8> %1, %2
%4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
ret <16 x i8> %4
}
define <2 x i64> @min_le_v2i64c() {
; SSE-LABEL: min_le_v2i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v2i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
%2 = insertelement <2 x i64> <i64 -1, i64 1>, i64 -1, i32 0
%3 = icmp sle <2 x i64> %1, %2
%4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
ret <2 x i64> %4
}
define <4 x i64> @min_le_v4i64c() {
; SSE-LABEL: min_le_v4i64c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,18446744073709551609]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v4i64c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
%2 = insertelement <4 x i64> <i64 -1, i64 -7, i64 7, i64 1>, i64 -1, i32 0
%3 = icmp sle <4 x i64> %1, %2
%4 = select <4 x i1> %3, <4 x i64> %1, <4 x i64> %2
ret <4 x i64> %4
}
define <4 x i32> @min_le_v4i32c() {
; SSE-LABEL: min_le_v4i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v4i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
%2 = insertelement <4 x i32> <i32 -1, i32 -7, i32 7, i32 1>, i32 -1, i32 0
%3 = icmp sle <4 x i32> %1, %2
%4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
ret <4 x i32> %4
}
define <8 x i32> @min_le_v8i32c() {
; SSE-LABEL: min_le_v8i32c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967291,4294967291,4294967289]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i32c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
%2 = insertelement <8 x i32> <i32 -1, i32 -3, i32 -5, i32 -7, i32 7, i32 5, i32 3, i32 1>, i32 -1, i32 0
%3 = icmp sle <8 x i32> %1, %2
%4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
ret <8 x i32> %4
}
define <8 x i16> @min_le_v8i16c() {
; SSE-LABEL: min_le_v8i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
%2 = insertelement <8 x i16> <i16 -1, i16 -3, i16 -5, i16 -7, i16 7, i16 5, i16 3, i16 1>, i16 -1, i32 0
%3 = icmp sle <8 x i16> %1, %2
%4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
ret <8 x i16> %4
}
define <16 x i16> @min_le_v16i16c() {
; SSE-LABEL: min_le_v16i16c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65530,65531,65532,65531,65530,65529,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v16i16c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
%2 = insertelement <16 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, i16 -1, i32 0
%3 = icmp sle <16 x i16> %1, %2
%4 = select <16 x i1> %3, <16 x i16> %1, <16 x i16> %2
ret <16 x i16> %4
}
define <16 x i8> @min_le_v16i8c() {
; SSE-LABEL: min_le_v16i8c:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v16i8c:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
%2 = insertelement <16 x i8> <i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, i8 -1, i32 0
%3 = icmp sle <16 x i8> %1, %2
%4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
ret <16 x i8> %4
}
Index: llvm/trunk/test/CodeGen/X86/zext-logicop-shift-load.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/zext-logicop-shift-load.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/X86/zext-logicop-shift-load.ll (revision 357116)
@@ -1,120 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-; FIXME: masked extend should be folded into and.
define i64 @test1(i8* %data) {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movb (%rdi), %al
-; CHECK-NEXT: shlb $2, %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: shlq $2, %rax
; CHECK-NEXT: andl $60, %eax
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = shl i8 %bf.load, 2
%0 = and i8 %bf.clear, 60
%mul = zext i8 %0 to i64
ret i64 %mul
}
-; FIXME: masked extend should be folded into and.
define i8* @test2(i8* %data) {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movb (%rdi), %al
-; CHECK-NEXT: shlb $2, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: andl $60, %eax
-; CHECK-NEXT: addq %rdi, %rax
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: andl $15, %eax
+; CHECK-NEXT: leaq (%rdi,%rax,4), %rax
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = shl i8 %bf.load, 2
%0 = and i8 %bf.clear, 60
%mul = zext i8 %0 to i64
%add.ptr = getelementptr inbounds i8, i8* %data, i64 %mul
ret i8* %add.ptr
}
; If the shift op is SHL, the logic op can only be AND.
define i64 @test3(i8* %data) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movb (%rdi), %al
; CHECK-NEXT: shlb $2, %al
; CHECK-NEXT: xorb $60, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = shl i8 %bf.load, 2
%0 = xor i8 %bf.clear, 60
%mul = zext i8 %0 to i64
ret i64 %mul
}
define i64 @test4(i8* %data) {
; CHECK-LABEL: test4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: andl $60, %eax
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = lshr i8 %bf.load, 2
%0 = and i8 %bf.clear, 60
%1 = zext i8 %0 to i64
ret i64 %1
}
define i64 @test5(i8* %data) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: xorq $60, %rax
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = lshr i8 %bf.load, 2
%0 = xor i8 %bf.clear, 60
%1 = zext i8 %0 to i64
ret i64 %1
}
define i64 @test6(i8* %data) {
; CHECK-LABEL: test6:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: orq $60, %rax
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = lshr i8 %bf.load, 2
%0 = or i8 %bf.clear, 60
%1 = zext i8 %0 to i64
ret i64 %1
}
; Load is folded with sext.
define i64 @test8(i8* %data) {
; CHECK-LABEL: test8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsbl (%rdi), %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: shrl $2, %eax
; CHECK-NEXT: orl $60, %eax
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%ext = sext i8 %bf.load to i16
%bf.clear = lshr i16 %ext, 2
%0 = or i16 %bf.clear, 60
%1 = zext i16 %0 to i64
ret i64 %1
}
Index: llvm/trunk/test/CodeGen/X86/vector-narrow-binop.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-narrow-binop.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/X86/vector-narrow-binop.ll (revision 357116)
@@ -1,149 +1,165 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
; AVX1 has support for 256-bit bitwise logic because the FP variants were included.
; If using those ops requires extra insert/extract, though, it's probably not worth it.
define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE-LABEL: PR32790:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: psubd %xmm6, %xmm0
; SSE-NEXT: psubd %xmm7, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: PR32790:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR32790:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32790:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubd %ymm3, %ymm0, %ymm0
; AVX512-NEXT: retq
%add = add <8 x i32> %a, %b
%and = and <8 x i32> %add, %c
%sub = sub <8 x i32> %and, %d
ret <8 x i32> %sub
}
; In a more extreme case, even the later AVX targets should avoid the extract/insert cost
; rather than use 256-bit ops just because they are supported.
define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE-LABEL: do_not_use_256bit_op:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: do_not_use_256bit_op:
; AVX: # %bb.0:
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%and = and <8 x i32> %concat1, %concat2
%extract1 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%sub = sub <4 x i32> %extract1, %extract2
ret <4 x i32> %sub
}
; When extracting from a vector binop, the source width should be a multiple of the destination width.
; https://bugs.llvm.org/show_bug.cgi?id=39511
define <3 x float> @PR39511(<4 x float> %t0, <3 x float>* %b) {
; SSE-LABEL: PR39511:
; SSE: # %bb.0:
; SSE-NEXT: addps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR39511:
; AVX: # %bb.0:
; AVX-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%add = fadd <4 x float> %t0, <float 1.0, float 2.0, float 3.0, float 4.0>
%ext = shufflevector <4 x float> %add, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
ret <3 x float> %ext
}
; When extracting from a vector binop, the extracted width must be at least
; as wide as one of the original vector elements.
; https://bugs.llvm.org/show_bug.cgi?id=39893
define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
; SSE-LABEL: PR39893:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: PR39893:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[2],zero
-; AVX-NEXT: retq
+; AVX1-LABEL: PR39893:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR39893:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR39893:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: retq
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
%bc = bitcast <2 x i32> %sub to <8 x i8>
%shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32> <i32 10, i32 4>
ret <2 x i8> %shuffle
}
define <2 x i8> @PR39893_2(<2 x float> %x) {
; SSE-LABEL: PR39893_2:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: PR39893_2:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%fsub = fsub <2 x float> zeroinitializer, %x
%bc = bitcast <2 x float> %fsub to <8 x i8>
%shuffle = shufflevector <8 x i8> %bc, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
ret <2 x i8> %shuffle
}
Index: llvm/trunk/test/CodeGen/X86/vector-trunc-usat-widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-usat-widen.ll (revision 357115)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-usat-widen.ll (revision 357116)
@@ -1,2475 +1,2481 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
;
; Unsigned saturation truncation to vXi32
;
define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v4i64_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v4i64_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: movdqa %xmm4, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm5
; SSSE3-NEXT: por %xmm5, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i64_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: por %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm3
+; SSE41-NEXT: por %xmm5, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729]
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
; SSE41-NEXT: movaps %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_usat_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-FAST-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v4i64_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vpmovqd %zmm1, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_usat_v4i64_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_usat_v4i64_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX512BW-NEXT: vpcmpltuq %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
; AVX512BWVL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp ult <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 429496729>
%3 = trunc <4 x i64> %2 to <4 x i32>
ret <4 x i32> %3
}
define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v8i64_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: pxor %xmm5, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm10, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm9, %xmm1
; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i64_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm3, %xmm7
; SSSE3-NEXT: pxor %xmm5, %xmm7
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm3, %xmm4
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pxor %xmm5, %xmm3
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm10, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm3
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm0, %xmm5
; SSSE3-NEXT: movdqa %xmm9, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm2, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm1
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; SSSE3-NEXT: movaps %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i64_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movapd {{.*#+}} xmm6 = [4294967295,4294967295]
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
+; SSE41-NEXT: movdqa %xmm9, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movapd %xmm6, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm6, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm2
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm6, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
-; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
-; SSE41-NEXT: movaps %xmm9, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm8, %xmm7
+; SSE41-NEXT: movdqa %xmm9, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm6
+; SSE41-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,2]
+; SSE41-NEXT: movaps %xmm6, %xmm0
+; SSE41-NEXT: movaps %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i64_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpxor %ymm3, %ymm0, %ymm3
; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_usat_v8i64_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ult <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = trunc <8 x i64> %2 to <8 x i32>
ret <8 x i32> %3
}
;
; Unsigned saturation truncation to vXi16
;
define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v8i64_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm10, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm10, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm6, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm6
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i64_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm6, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
; SSSE3-NEXT: movdqa %xmm9, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm10, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm2, %xmm5
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm10, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm6, %xmm3
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm1, %xmm6
; SSSE3-NEXT: movdqa %xmm9, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm0
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i64_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm9 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movapd {{.*#+}} xmm5 = [65535,65535]
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
+; SSE41-NEXT: movdqa %xmm9, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm5, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT: packusdw %xmm5, %xmm1
+; SSE41-NEXT: packusdw %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm2, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm9, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE41-NEXT: packusdw %xmm5, %xmm9
-; SSE41-NEXT: packusdw %xmm9, %xmm1
+; SSE41-NEXT: movapd %xmm5, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm9, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
+; SSE41-NEXT: packusdw %xmm4, %xmm5
+; SSE41-NEXT: packusdw %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_usat_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_usat_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovusqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp ult <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%3 = trunc <8 x i64> %2 to <8 x i16>
ret <8 x i16> %3
}
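; Once the lanes are clamped to at most 65535, the narrowing step cannot lose
; data: SSE4.1/AVX pack the clamped quadwords down with packusdw, SSE2/SSSE3
; fall back to pshufd/pshuflw/punpckldq shuffles, and AVX512 again collapses
; the whole pattern into one instruction (vpmovusqw).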
define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
; SSE2-LABEL: trunc_usat_v8i32_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm5
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i32_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm3, %xmm4