Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -407,8 +407,11 @@
   /// features.
   unsigned PreferVectorWidth;
 
-  /// Required vector width from function attribute.
-  unsigned RequiredVectorWidth;
+  /// Legal vector width override from function attribute.
+  unsigned LegalVectorWidthOverride;
+
+  /// Legal vector width used during type legalization.
+  unsigned LegalVectorWidth;
 
   /// True if compiling for 64-bit, false for 16-bit or 32-bit.
   bool In64BitMode;
@@ -437,7 +440,7 @@
   X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
               const X86TargetMachine &TM, unsigned StackAlignOverride,
               unsigned PreferVectorWidthOverride,
-              unsigned RequiredVectorWidth);
+              unsigned LegalVectorWidthOverride);
 
   const X86TargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
@@ -625,7 +628,6 @@
   bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
 
   unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
-  unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
 
   // Helper functions to determine when we should allow widening to 512-bit
   // during codegen.
@@ -641,7 +643,7 @@
   // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
   // disable them in the legalizer.
   bool useAVX512Regs() const {
-    return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
+    return hasAVX512() && (canExtendTo512DQ() || LegalVectorWidth > 256);
   }
 
   bool useBWIRegs() const {
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -52,6 +52,11 @@
 X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
                cl::desc("Enable early if-conversion on X86"));
 
+static cl::opt<bool>
+LegalizeToPreferVectorWidth("x86-experimental-legalize-to-prefer-vector-width",
+                            cl::desc("Legalize to the preferred vector width"),
+                            cl::init(false), cl::Hidden);
+
 /// Classify a blockaddress reference for the current subtarget according to
 /// how we should reference it in a non-pcrel context.
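// A sketch of how the two new fields are intended to interact (this mirrors
// the resolution order in the hunks below; all names are from this patch,
// nothing here is new API):
//
//   if (LegalVectorWidthOverride)          // "legal-vector-width" attribute
//     LegalVectorWidth = LegalVectorWidthOverride;
//   else if (LegalizeToPreferVectorWidth)  // the cl::opt added above
//     LegalVectorWidth = PreferVectorWidth;
//   // otherwise LegalVectorWidth stays UINT32_MAX, so 512-bit types remain
//   // legal whenever AVX512 is available.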
@@ -263,6 +268,11 @@
     PreferVectorWidth = PreferVectorWidthOverride;
   else if (Prefer256Bit)
     PreferVectorWidth = 256;
+
+  if (LegalVectorWidthOverride)
+    LegalVectorWidth = LegalVectorWidthOverride;
+  else if (LegalizeToPreferVectorWidth)
+    LegalVectorWidth = PreferVectorWidth;
 }
 
 void X86Subtarget::initializeEnvironment() {
@@ -363,6 +373,7 @@
   GatherOverhead = 1024;
   ScatterOverhead = 1024;
   PreferVectorWidth = UINT32_MAX;
+  LegalVectorWidth = UINT32_MAX;
   Prefer256Bit = false;
 }
 
@@ -377,12 +388,12 @@
                            const X86TargetMachine &TM, unsigned StackAlignOverride,
                            unsigned PreferVectorWidthOverride,
-                           unsigned RequiredVectorWidth)
+                           unsigned LegalVectorWidthOverride)
     : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
       PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
       StackAlignOverride(StackAlignOverride),
       PreferVectorWidthOverride(PreferVectorWidthOverride),
-      RequiredVectorWidth(RequiredVectorWidth),
+      LegalVectorWidthOverride(LegalVectorWidthOverride),
       In64BitMode(TargetTriple.getArch() == Triple::x86_64),
       In32BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() != Triple::CODE16),
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -275,15 +275,15 @@
     }
   }
 
-  // Extract required-vector-width attribute.
-  unsigned RequiredVectorWidth = UINT32_MAX;
-  if (F.hasFnAttribute("required-vector-width")) {
-    StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString();
+  // Extract legal-vector-width attribute.
+  unsigned LegalVectorWidthOverride = 0;
+  if (F.hasFnAttribute("legal-vector-width")) {
+    StringRef Val = F.getFnAttribute("legal-vector-width").getValueAsString();
     unsigned Width;
     if (!Val.getAsInteger(0, Width)) {
-      Key += ",required-vector-width=";
+      Key += ",legal-vector-width=";
       Key += Val;
-      RequiredVectorWidth = Width;
+      LegalVectorWidthOverride = Width;
     }
   }
 
@@ -301,7 +301,7 @@
     I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
                                         Options.StackAlignmentOverride,
                                         PreferVectorWidthOverride,
-                                        RequiredVectorWidth);
+                                        LegalVectorWidthOverride);
   }
   return I.get();
 }
Index: test/CodeGen/X86/legal-vector-width.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/legal-vector-width.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit -x86-experimental-legalize-to-prefer-vector-width=true | FileCheck %s --check-prefix=CHECK --check-prefix=LEGAL256
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s --check-prefix=CHECK --check-prefix=LEGAL512
+
+; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed to be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
+
+; The first two tests make sure the legal-vector-width attribute works. The remaining tests use command lines to force different legalization configurations.
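+;
+; With a 256-bit legal width, <16 x i32> is not a legal type, so operations on
+; it are split into two 256-bit halves; with a 512-bit legal width they stay as
+; single zmm operations. A minimal sketch of the pattern being tested (the
+; add256/add512 functions below are the FileCheck'd versions of this):
+;
+;   %f = add <16 x i32> %d, %e  ; two vpaddd ymm ops at width 256,
+;                               ; one vpaddd zmm op at width 512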
+
+define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "legal-vector-width"="256" {
+; CHECK-LABEL: add256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0
+; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1
+; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %d = load <16 x i32>, <16 x i32>* %a
+  %e = load <16 x i32>, <16 x i32>* %b
+  %f = add <16 x i32> %d, %e
+  store <16 x i32> %f, <16 x i32>* %c
+  ret void
+}
+
+define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "legal-vector-width"="512" {
+; CHECK-LABEL: add512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %d = load <16 x i32>, <16 x i32>* %a
+  %e = load <16 x i32>, <16 x i32>* %b
+  %f = add <16 x i32> %d, %e
+  store <16 x i32> %f, <16 x i32>* %c
+  ret void
+}
+
+define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
+; LEGAL256-LABEL: avg_v64i8:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vmovdqa (%rsi), %ymm0
+; LEGAL256-NEXT: vmovdqa 32(%rsi), %ymm1
+; LEGAL256-NEXT: vpavgb (%rdi), %ymm0, %ymm0
+; LEGAL256-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
+; LEGAL256-NEXT: vmovdqu %ymm1, (%rax)
+; LEGAL256-NEXT: vmovdqu %ymm0, (%rax)
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: avg_v64i8:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vmovdqa64 (%rsi), %zmm0
+; LEGAL512-NEXT: vpavgb (%rdi), %zmm0, %zmm0
+; LEGAL512-NEXT: vmovdqu64 %zmm0, (%rax)
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+  %1 = load <64 x i8>, <64 x i8>* %a
+  %2 = load <64 x i8>, <64 x i8>* %b
+  %3 = zext <64 x i8> %1 to <64 x i32>
+  %4 = zext <64 x i8> %2 to <64 x i32>
+  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %6 = add nuw nsw <64 x i32> %5, %4
+  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = trunc <64 x i32> %7 to <64 x i8>
+  store <64 x i8> %8, <64 x i8>* undef, align 4
+  ret void
+}
+
+define void @pmaddwd_32(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) {
+; LEGAL256-LABEL: pmaddwd_32:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vmovdqa (%rdi), %ymm0
+; LEGAL256-NEXT: vmovdqa 32(%rdi), %ymm1
+; LEGAL256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
+; LEGAL256-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
+; LEGAL256-NEXT: vmovdqa %ymm1, 32(%rdx)
+; LEGAL256-NEXT: vmovdqa %ymm0, (%rdx)
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: pmaddwd_32:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vmovdqa64 (%rdi), %zmm0
+; LEGAL512-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0
+; LEGAL512-NEXT: vmovdqa64 %zmm0, (%rdx)
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+  %A = load <32 x i16>, <32 x i16>* %APtr
+  %B = load <32 x i16>, <32 x i16>* %BPtr
+  %a = sext <32 x i16> %A to <32 x i32>
+  %b = sext <32 x i16> %B to <32 x i32>
+  %m = mul nsw <32 x i32> %a, %b
+  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %ret = add <16 x i32> %odd, %even
+  store <16 x i32> %ret, <16 x i32>* %CPtr
+  ret void
+}
+
+define void @psubus_64i8_max(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) {
+; LEGAL256-LABEL: psubus_64i8_max:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vmovdqa (%rdi), %ymm0
+; LEGAL256-NEXT: vmovdqa 32(%rdi), %ymm1
+; LEGAL256-NEXT: vpsubusb (%rsi), %ymm0, %ymm0
+; LEGAL256-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1
+; LEGAL256-NEXT: vmovdqa %ymm1, 32(%rdx)
+; LEGAL256-NEXT: vmovdqa %ymm0, (%rdx)
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: psubus_64i8_max:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vmovdqa64 (%rdi), %zmm0
+; LEGAL512-NEXT: vpsubusb (%rsi), %zmm0, %zmm0
+; LEGAL512-NEXT: vmovdqa64 %zmm0, (%rdx)
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+  %x = load <64 x i8>, <64 x i8>* %xptr
+  %y = load <64 x i8>, <64 x i8>* %yptr
+  %cmp = icmp ult <64 x i8> %x, %y
+  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
+  %res = sub <64 x i8> %max, %y
+  store <64 x i8> %res, <64 x i8>* %zptr
+  ret void
+}
+
+define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i32) {
+; LEGAL256-LABEL: _Z9test_charPcS_i:
+; LEGAL256: # %bb.0: # %entry
+; LEGAL256-NEXT: movl %edx, %eax
+; LEGAL256-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; LEGAL256-NEXT: xorl %ecx, %ecx
+; LEGAL256-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; LEGAL256-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; LEGAL256-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; LEGAL256-NEXT: .p2align 4, 0x90
+; LEGAL256-NEXT: .LBB5_1: # %vector.body
+; LEGAL256-NEXT: # =>This Inner Loop Header: Depth=1
+; LEGAL256-NEXT: vpmovsxbw (%rdi,%rcx), %xmm4
+; LEGAL256-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5
+; LEGAL256-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm6
+; LEGAL256-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm8
+; LEGAL256-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
+; LEGAL256-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
+; LEGAL256-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
+; LEGAL256-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
+; LEGAL256-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
+; LEGAL256-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
+; LEGAL256-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
+; LEGAL256-NEXT: vpmaddwd %xmm8, %xmm7, %xmm7
+; LEGAL256-NEXT: vpaddd %ymm3, %ymm7, %ymm3
+; LEGAL256-NEXT: vpaddd %ymm2, %ymm6, %ymm2
+; LEGAL256-NEXT: vpaddd %ymm1, %ymm5, %ymm1
+; LEGAL256-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; LEGAL256-NEXT: addq $32, %rcx
+; LEGAL256-NEXT: cmpq %rcx, %rax
+; LEGAL256-NEXT: jne .LBB5_1
+; LEGAL256-NEXT: # %bb.2: # %middle.block
+; LEGAL256-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; LEGAL256-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; LEGAL256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; LEGAL256-NEXT: vextracti128 $1, %ymm0, %xmm1
+; LEGAL256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; LEGAL256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LEGAL256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; LEGAL256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; LEGAL256-NEXT: vmovd %xmm0, %eax
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: _Z9test_charPcS_i:
+; LEGAL512: # %bb.0: # %entry
+; LEGAL512-NEXT: movl %edx, %eax
+; LEGAL512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; LEGAL512-NEXT: xorl %ecx, %ecx
+; LEGAL512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; LEGAL512-NEXT: .p2align 4, 0x90
+; LEGAL512-NEXT: .LBB5_1: # %vector.body
+; LEGAL512-NEXT: # =>This Inner Loop Header: Depth=1
+; LEGAL512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
+; LEGAL512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
+; LEGAL512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; LEGAL512-NEXT: addq $32, %rcx
+; LEGAL512-NEXT: cmpq %rcx, %rax
+; LEGAL512-NEXT: jne .LBB5_1
+; LEGAL512-NEXT: # %bb.2: # %middle.block
+; LEGAL512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; LEGAL512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vmovd %xmm0, %eax
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+entry:
+  %3 = zext i32 %2 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
+  %4 = getelementptr inbounds i8, i8* %0, i64 %index
+  %5 = bitcast i8* %4 to <32 x i8>*
+  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
+  %6 = sext <32 x i8> %wide.load to <32 x i32>
+  %7 = getelementptr inbounds i8, i8* %1, i64 %index
+  %8 = bitcast i8* %7 to <32 x i8>*
+  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
+  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
+  %10 = mul nsw <32 x i32> %9, %6
+  %11 = add nsw <32 x i32> %10, %vec.phi
+  %index.next = add i64 %index, 32
+  %12 = icmp eq i64 %index.next, %3
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:
+  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
+  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
+  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
+  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
+  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
+  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
+  ret i32 %13
+}
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @sad_16i8() {
+; LEGAL256-LABEL: sad_16i8:
+; LEGAL256: # %bb.0: # %entry
+; LEGAL256-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; LEGAL256-NEXT: movq $-1024, %rax # imm = 0xFC00
+; LEGAL256-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; LEGAL256-NEXT: .p2align 4, 0x90
+; LEGAL256-NEXT: .LBB6_1: # %vector.body
+; LEGAL256-NEXT: # =>This Inner Loop Header: Depth=1
+; LEGAL256-NEXT: vmovdqu a+1024(%rax), %xmm2
+; LEGAL256-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; LEGAL256-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; LEGAL256-NEXT: addq $4, %rax
+; LEGAL256-NEXT: jne .LBB6_1
+; LEGAL256-NEXT: # %bb.2: # %middle.block
+; LEGAL256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; LEGAL256-NEXT: vextracti128 $1, %ymm0, %xmm1
+; LEGAL256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; LEGAL256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LEGAL256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; LEGAL256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; LEGAL256-NEXT: vmovd %xmm0, %eax
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: sad_16i8:
+; LEGAL512: # %bb.0: # %entry
+; LEGAL512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; LEGAL512-NEXT: movq $-1024, %rax # imm = 0xFC00
+; LEGAL512-NEXT: .p2align 4, 0x90
+; LEGAL512-NEXT: .LBB6_1: # %vector.body
+; LEGAL512-NEXT: # =>This Inner Loop Header: Depth=1
+; LEGAL512-NEXT: vmovdqu a+1024(%rax), %xmm1
+; LEGAL512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
+; LEGAL512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; LEGAL512-NEXT: addq $4, %rax
+; LEGAL512-NEXT: jne .LBB6_1
+; LEGAL512-NEXT: # %bb.2: # %middle.block
+; LEGAL512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; LEGAL512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; LEGAL512-NEXT: vmovd %xmm0, %eax
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+  %1 = bitcast i8* %0 to <16 x i8>*
+  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
+  %2 = zext <16 x i8> %wide.load to <16 x i32>
+  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+  %4 = bitcast i8* %3 to <16 x i8>*
+  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
+  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
+  %6 = sub nsw <16 x i32> %2, %5
+  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <16 x i32> zeroinitializer, %6
+  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
+  %10 = add nsw <16 x i32> %9, %vec.phi
+  %index.next = add i64 %index, 4
+  %11 = icmp eq i64 %index.next, 1024
+  br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
+  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
+  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
+  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
+  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
+  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
+  ret i32 %12
+}
+
+define void @sbto16f32(<16 x i16> %a, <16 x float>* %res) {
+; LEGAL256-LABEL: sbto16f32:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL256-NEXT: kshiftrw $8, %k0, %k1
+; LEGAL256-NEXT: vpmovm2d %k1, %ymm0
+; LEGAL256-NEXT: vcvtdq2ps %ymm0, %ymm0
+; LEGAL256-NEXT: vpmovm2d %k0, %ymm1
+; LEGAL256-NEXT: vcvtdq2ps %ymm1, %ymm1
+; LEGAL256-NEXT: vmovaps %ymm1, (%rdi)
+; LEGAL256-NEXT: vmovaps %ymm0, 32(%rdi)
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: sbto16f32:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL512-NEXT: vpmovm2d %k0, %zmm0
+; LEGAL512-NEXT: vcvtdq2ps %zmm0, %zmm0
+; LEGAL512-NEXT: vmovaps %zmm0, (%rdi)
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+  %mask = icmp slt <16 x i16> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %mask to <16 x float>
+  store <16 x float> %1, <16 x float>* %res
+  ret void
+}
+
+define void @sbto16f64(<16 x i16> %a, <16 x double>* %res) {
+; LEGAL256-LABEL: sbto16f64:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL256-NEXT: kshiftrw $8, %k0, %k1
+; LEGAL256-NEXT: vpmovm2d %k1, %ymm0
+; LEGAL256-NEXT: vcvtdq2pd %xmm0, %ymm1
+; LEGAL256-NEXT: vextracti128 $1, %ymm0, %xmm0
+; LEGAL256-NEXT: vcvtdq2pd %xmm0, %ymm0
+; LEGAL256-NEXT: vpmovm2d %k0, %ymm2
+; LEGAL256-NEXT: vcvtdq2pd %xmm2, %ymm3
+; LEGAL256-NEXT: vextracti128 $1, %ymm2, %xmm2
+; LEGAL256-NEXT: vcvtdq2pd %xmm2, %ymm2
+; LEGAL256-NEXT: vmovaps %ymm2, 32(%rdi)
+; LEGAL256-NEXT: vmovaps %ymm3, (%rdi)
+; LEGAL256-NEXT: vmovaps %ymm0, 96(%rdi)
+; LEGAL256-NEXT: vmovaps %ymm1, 64(%rdi)
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: sbto16f64:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL512-NEXT: vpmovm2d %k0, %zmm0
+; LEGAL512-NEXT: vcvtdq2pd %ymm0, %zmm1
+; LEGAL512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; LEGAL512-NEXT: vcvtdq2pd %ymm0, %zmm0
+; LEGAL512-NEXT: vmovaps %zmm0, 64(%rdi)
+; LEGAL512-NEXT: vmovaps %zmm1, (%rdi)
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+  %mask = icmp slt <16 x i16> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %mask to <16 x double>
+  store <16 x double> %1, <16 x double>* %res
+  ret void
+}
+
+define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) {
+; LEGAL256-LABEL: ubto16f32_256:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL256-NEXT: kshiftrw $8, %k0, %k1
+; LEGAL256-NEXT: vpmovm2d %k1, %ymm0
+; LEGAL256-NEXT: vpsrld $31, %ymm0, %ymm0
+; LEGAL256-NEXT: vcvtdq2ps %ymm0, %ymm0
+; LEGAL256-NEXT: vpmovm2d %k0, %ymm1
+; LEGAL256-NEXT: vpsrld $31, %ymm1, %ymm1
+; LEGAL256-NEXT: vcvtdq2ps %ymm1, %ymm1
+; LEGAL256-NEXT: vmovaps %ymm1, (%rdi)
+; LEGAL256-NEXT: vmovaps %ymm0, 32(%rdi)
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: ubto16f32_256:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL512-NEXT: vpmovm2d %k0, %zmm0
+; LEGAL512-NEXT: vpsrld $31, %zmm0, %zmm0
+; LEGAL512-NEXT: vcvtdq2ps %zmm0, %zmm0
+; LEGAL512-NEXT: vmovaps %zmm0, (%rdi)
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+  %mask = icmp slt <16 x i16> %a, zeroinitializer
+  %1 = uitofp <16 x i1> %mask to <16 x float>
+  store <16 x float> %1, <16 x float>* %res
+  ret void
+}
+
+define void @ubto16f64(<16 x i16> %a, <16 x double>* %res) {
+; LEGAL256-LABEL: ubto16f64:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL256-NEXT: kshiftrw $8, %k0, %k1
+; LEGAL256-NEXT: vpmovm2d %k1, %ymm0
+; LEGAL256-NEXT: vpsrld $31, %ymm0, %ymm0
+; LEGAL256-NEXT: vcvtdq2pd %xmm0, %ymm1
+; LEGAL256-NEXT: vextracti128 $1, %ymm0, %xmm0
+; LEGAL256-NEXT: vcvtdq2pd %xmm0, %ymm0
+; LEGAL256-NEXT: vpmovm2d %k0, %ymm2
+; LEGAL256-NEXT: vpsrld $31, %ymm2, %ymm2
+; LEGAL256-NEXT: vcvtdq2pd %xmm2, %ymm3
+; LEGAL256-NEXT: vextracti128 $1, %ymm2, %xmm2
+; LEGAL256-NEXT: vcvtdq2pd %xmm2, %ymm2
+; LEGAL256-NEXT: vmovaps %ymm2, 32(%rdi)
+; LEGAL256-NEXT: vmovaps %ymm3, (%rdi)
+; LEGAL256-NEXT: vmovaps %ymm0, 96(%rdi)
+; LEGAL256-NEXT: vmovaps %ymm1, 64(%rdi)
+; LEGAL256-NEXT: vzeroupper
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: ubto16f64:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vpmovw2m %ymm0, %k0
+; LEGAL512-NEXT: vpmovm2d %k0, %zmm0
+; LEGAL512-NEXT: vpsrld $31, %zmm0, %zmm0
+; LEGAL512-NEXT: vcvtdq2pd %ymm0, %zmm1
+; LEGAL512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; LEGAL512-NEXT: vcvtdq2pd %ymm0, %zmm0
+; LEGAL512-NEXT: vmovaps %zmm0, 64(%rdi)
+; LEGAL512-NEXT: vmovaps %zmm1, (%rdi)
+; LEGAL512-NEXT: vzeroupper
+; LEGAL512-NEXT: retq
+  %mask = icmp slt <16 x i16> %a, zeroinitializer
+  %1 = uitofp <16 x i1> %mask to <16 x double>
+  store <16 x double> %1, <16 x double>* %res
+  ret void
+}
+
+define <16 x i16> @test_16f32toub(<16 x float>* %ptr, <16 x i16> %passthru) {
+; LEGAL256-LABEL: test_16f32toub:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vcvttps2dq (%rdi), %ymm1
+; LEGAL256-NEXT: vpmovdw %ymm1, %xmm1
+; LEGAL256-NEXT: vcvttps2dq 32(%rdi), %ymm2
+; LEGAL256-NEXT: vpmovdw %ymm2, %xmm2
+; LEGAL256-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; LEGAL256-NEXT: vpsllw $15, %ymm1, %ymm1
+; LEGAL256-NEXT: vpmovw2m %ymm1, %k1
+; LEGAL256-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: test_16f32toub:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vcvttps2dq (%rdi), %zmm1
+; LEGAL512-NEXT: vpslld $31, %zmm1, %zmm1
+; LEGAL512-NEXT: vpmovd2m %zmm1, %k1
+; LEGAL512-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
+; LEGAL512-NEXT: retq
+  %a = load <16 x float>, <16 x float>* %ptr
+  %mask = fptoui <16 x float> %a to <16 x i1>
+  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
+  ret <16 x i16> %select
+}
+
+define <16 x i16> @test_16f32tosb(<16 x float>* %ptr, <16 x i16> %passthru) {
+; LEGAL256-LABEL: test_16f32tosb:
+; LEGAL256: # %bb.0:
+; LEGAL256-NEXT: vcvttps2dq (%rdi), %ymm1
+; LEGAL256-NEXT: vpmovdw %ymm1, %xmm1
+; LEGAL256-NEXT: vcvttps2dq 32(%rdi), %ymm2
+; LEGAL256-NEXT: vpmovdw %ymm2, %xmm2
+; LEGAL256-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; LEGAL256-NEXT: vpsllw $15, %ymm1, %ymm1
+; LEGAL256-NEXT: vpmovw2m %ymm1, %k1
+; LEGAL256-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
+; LEGAL256-NEXT: retq
+;
+; LEGAL512-LABEL: test_16f32tosb:
+; LEGAL512: # %bb.0:
+; LEGAL512-NEXT: vcvttps2dq (%rdi), %zmm1
+; LEGAL512-NEXT: vpmovd2m %zmm1, %k1
+; LEGAL512-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
+; LEGAL512-NEXT: retq
+  %a = load <16 x float>, <16 x float>* %ptr
+  %mask = fptosi <16 x float> %a to <16 x i1>
+  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
+  ret <16 x i16> %select
+}
Index: test/CodeGen/X86/required-vector-width.ll
===================================================================
--- test/CodeGen/X86/required-vector-width.ll
+++ /dev/null
@@ -1,655 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s
-
-; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
-
-define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" {
-; CHECK-LABEL: add256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0
-; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
-; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %d = load <16 x i32>, <16 x i32>* %a
-  %e = load <16 x i32>, <16 x i32>* %b
-  %f = add <16 x i32> %d, %e
-  store <16 x i32> %f, <16 x i32>* %c
-  ret void
-}
-
-define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" {
-; CHECK-LABEL: add512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0
-; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %d = load <16 x i32>, <16 x i32>* %a
-  %e = load <16 x i32>, <16 x i32>* %b
-  %f = add <16 x i32> %d, %e
-  store <16 x i32> %f, <16 x i32>* %c
-  ret void
-}
-
-define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
-; CHECK-LABEL: avg_v64i8_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rsi), %ymm0
-; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
-; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
-; CHECK-NEXT: vmovdqu %ymm1, (%rax)
-; CHECK-NEXT: vmovdqu %ymm0, (%rax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %1 = load <64 x i8>, <64 x i8>* %a
-  %2 = load <64 x i8>, <64 x i8>* %b
-  %3 = zext <64 x i8> %1 to <64 x i32>
-  %4 = zext <64 x i8> %2 to <64 x i32>
-  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %6 = add nuw nsw <64 x i32> %5, %4
-  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %8 = trunc <64 x i32> %7 to <64 x i8>
-  store <64 x i8> %8, <64 x i8>* undef, align 4
-  ret void
-}
-
-
-define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" {
-; CHECK-LABEL: avg_v64i8_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0
-; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, (%rax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %1 = load <64 x i8>, <64 x i8>* %a
-  %2 = load <64 x i8>, <64 x i8>* %b
-  %3 = zext <64 x i8> %1 to <64 x i32>
-  %4 = zext <64 x i8> %2 to <64 x i32>
-  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %6 = add nuw nsw <64 x i32> %5, %4
-  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %8 = trunc <64 x i32> %7 to <64 x i8>
-  store <64 x i8> %8, <64 x i8>* undef, align 4
-  ret void
-}
-
-define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" {
-; CHECK-LABEL: pmaddwd_32_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
-; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
-; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %A = load <32 x i16>, <32 x i16>* %APtr
-  %B = load <32 x i16>, <32 x i16>* %BPtr
-  %a = sext <32 x i16> %A to <32 x i32>
-  %b = sext <32 x i16> %B to <32 x i32>
-  %m = mul nsw <32 x i32> %a, %b
-  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-  %ret = add <16 x i32> %odd, %even
-  store <16 x i32> %ret, <16 x i32>* %CPtr
-  ret void
-}
-
-define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" {
-; CHECK-LABEL: pmaddwd_32_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0
-; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %A = load <32 x i16>, <32 x i16>* %APtr
-  %B = load <32 x i16>, <32 x i16>* %BPtr
-  %a = sext <32 x i16> %A to <32 x i32>
-  %b = sext <32 x i16> %B to <32 x i32>
-  %m = mul nsw <32 x i32> %a, %b
-  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-  %ret = add <16 x i32> %odd, %even
-  store <16 x i32> %ret, <16 x i32>* %CPtr
-  ret void
-}
-
-define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" {
-; CHECK-LABEL: psubus_64i8_max_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0
-; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
-; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %x = load <64 x i8>, <64 x i8>* %xptr
-  %y = load <64 x i8>, <64 x i8>* %yptr
-  %cmp = icmp ult <64 x i8> %x, %y
-  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
-  %res = sub <64 x i8> %max, %y
-  store <64 x i8> %res, <64 x i8>* %zptr
-  ret void
-}
-
-define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" {
-; CHECK-LABEL: psubus_64i8_max_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0
-; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %x = load <64 x i8>, <64 x i8>* %xptr
-  %y = load <64 x i8>, <64 x i8>* %yptr
-  %cmp = icmp ult <64 x i8> %x, %y
-  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
-  %res = sub <64 x i8> %max, %y
-  store <64 x i8> %res, <64 x i8>* %zptr
-  ret void
-}
-
-define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" {
-; CHECK-LABEL: _Z9test_charPcS_i_256:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB8_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %xmm4
-; CHECK-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5
-; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm6
-; CHECK-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm8
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
-; CHECK-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
-; CHECK-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
-; CHECK-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
-; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
-; CHECK-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
-; CHECK-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
-; CHECK-NEXT: vpmaddwd %xmm8, %xmm7, %xmm7
-; CHECK-NEXT: vpaddd %ymm3, %ymm7, %ymm3
-; CHECK-NEXT: vpaddd %ymm2, %ymm6, %ymm2
-; CHECK-NEXT: vpaddd %ymm1, %ymm5, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0
-; CHECK-NEXT: addq $32, %rcx
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: jne .LBB8_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
-  %3 = zext i32 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
-  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
-  %4 = getelementptr inbounds i8, i8* %0, i64 %index
-  %5 = bitcast i8* %4 to <32 x i8>*
-  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
-  %6 = sext <32 x i8> %wide.load to <32 x i32>
-  %7 = getelementptr inbounds i8, i8* %1, i64 %index
-  %8 = bitcast i8* %7 to <32 x i8>*
-  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
-  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
-  %10 = mul nsw <32 x i32> %9, %6
-  %11 = add nsw <32 x i32> %10, %vec.phi
-  %index.next = add i64 %index, 32
-  %12 = icmp eq i64 %index.next, %3
-  br i1 %12, label %middle.block, label %vector.body
-
-middle.block:
-  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
-  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
-  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
-  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
-  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
-  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
-  ret i32 %13
-}
-
-define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" {
-; CHECK-LABEL: _Z9test_charPcS_i_512:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB9_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
-; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
-; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
-; CHECK-NEXT: addq $32, %rcx
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: jne .LBB9_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
-  %3 = zext i32 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
-  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
-  %4 = getelementptr inbounds i8, i8* %0, i64 %index
-  %5 = bitcast i8* %4 to <32 x i8>*
-  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
-  %6 = sext <32 x i8> %wide.load to <32 x i32>
-  %7 = getelementptr inbounds i8, i8* %1, i64 %index
-  %8 = bitcast i8* %7 to <32 x i8>*
-  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
-  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
-  %10 = mul nsw <32 x i32> %9, %6
-  %11 = add nsw <32 x i32> %10, %vec.phi
-  %index.next = add i64 %index, 32
-  %12 = icmp eq i64 %index.next, %3
-  br i1 %12, label %middle.block, label %vector.body
-
-middle.block:
-  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
-  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
-  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
-  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
-  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
-  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
-  ret i32 %13
-}
-
-@a = global [1024 x i8] zeroinitializer, align 16
-@b = global [1024 x i8] zeroinitializer, align 16
-
-define i32 @sad_16i8_256() "required-vector-width"="256" {
-; CHECK-LABEL: sad_16i8_256:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB10_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2
-; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
-; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; CHECK-NEXT: addq $4, %rax
-; CHECK-NEXT: jne .LBB10_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
-  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
-  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
-  %1 = bitcast i8* %0 to <16 x i8>*
-  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
-  %2 = zext <16 x i8> %wide.load to <16 x i32>
-  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
-  %4 = bitcast i8* %3 to <16 x i8>*
-  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
-  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
-  %6 = sub nsw <16 x i32> %2, %5
-  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  %8 = sub nsw <16 x i32> zeroinitializer, %6
-  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
-  %10 = add nsw <16 x i32> %9, %vec.phi
-  %index.next = add i64 %index, 4
-  %11 = icmp eq i64 %index.next, 1024
-  br i1 %11, label %middle.block, label %vector.body
-
-middle.block:
-  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
-  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
-  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
-  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
-  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
-  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
-  ret i32 %12
-}
-
-define i32 @sad_16i8_512() "required-vector-width"="512" {
-; CHECK-LABEL: sad_16i8_512:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB11_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1
-; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: addq $4, %rax
-; CHECK-NEXT: jne .LBB11_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
-  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
-  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
-  %1 = bitcast i8* %0 to <16 x i8>*
-  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
-  %2 = zext <16 x i8> %wide.load to <16 x i32>
-  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
-  %4 = bitcast i8* %3 to <16 x i8>*
-  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
-  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
-  %6 = sub nsw <16 x i32> %2, %5
-  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  %8 = sub nsw <16 x i32> zeroinitializer, %6
-  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
-  %10 = add nsw <16 x i32> %9, %vec.phi
-  %index.next = add i64 %index, 4
-  %11 = icmp eq i64 %index.next, 1024
-  br i1 %11, label %middle.block, label %vector.body
-
-middle.block:
-  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
-  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
-  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
-  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
-  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
-  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
-  ret i32 %12
-}
-
-define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
-; CHECK-LABEL: sbto16f32_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: kshiftrw $8, %k0, %k1
-; CHECK-NEXT: vpmovm2d %k1, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vpmovm2d %k0, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vmovaps %ymm1, (%rdi)
-; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = sitofp <16 x i1> %mask to <16 x float>
-  store <16 x float> %1, <16 x float>* %res
-  ret void
-}
-
-define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
-; CHECK-LABEL: sbto16f32_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: vpmovm2d %k0, %zmm0
-; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = sitofp <16 x i1> %mask to <16 x float>
-  store <16 x float> %1, <16 x float>* %res
-  ret void
-}
-
-define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
-; CHECK-LABEL: sbto16f64_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: kshiftrw $8, %k0, %k1
-; CHECK-NEXT: vpmovm2d %k1, %ymm0
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
-; CHECK-NEXT: vpmovm2d %k0, %ymm2
-; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
-; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
-; CHECK-NEXT: vmovaps %ymm3, (%rdi)
-; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
-; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = sitofp <16 x i1> %mask to <16 x double>
-  store <16 x double> %1, <16 x double>* %res
-  ret void
-}
-
-define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
-; CHECK-LABEL: sbto16f64_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: vpmovm2d %k0, %zmm0
-; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
-; CHECK-NEXT: vmovaps %zmm1, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = sitofp <16 x i1> %mask to <16 x double>
-  store <16 x double> %1, <16 x double>* %res
-  ret void
-}
-
-define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
-; CHECK-LABEL: ubto16f32_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: kshiftrw $8, %k0, %k1
-; CHECK-NEXT: vpmovm2d %k1, %ymm0
-; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vpmovm2d %k0, %ymm1
-; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vmovaps %ymm1, (%rdi)
-; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = uitofp <16 x i1> %mask to <16 x float>
-  store <16 x float> %1, <16 x float>* %res
-  ret void
-}
-
-define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
-; CHECK-LABEL: ubto16f32_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: vpmovm2d %k0, %zmm0
-; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
-; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = uitofp <16 x i1> %mask to <16 x float>
-  store <16 x float> %1, <16 x float>* %res
-  ret void
-}
-
-define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
-; CHECK-LABEL: ubto16f64_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: kshiftrw $8, %k0, %k1
-; CHECK-NEXT: vpmovm2d %k1, %ymm0
-; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
-; CHECK-NEXT: vpmovm2d %k0, %ymm2
-; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2
-; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
-; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
-; CHECK-NEXT: vmovaps %ymm3, (%rdi)
-; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
-; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = uitofp <16 x i1> %mask to <16 x double>
-  store <16 x double> %1, <16 x double>* %res
-  ret void
-}
-
-define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
-; CHECK-LABEL: ubto16f64_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: vpmovm2d %k0, %zmm0
-; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
-; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
-; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
-; CHECK-NEXT: vmovaps %zmm1, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-  %mask = icmp slt <16 x i16> %a, zeroinitializer
-  %1 = uitofp <16 x i1> %mask to <16 x double>
-  store <16 x double> %1, <16 x double>* %res
-  ret void
-}
-
-define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
-; CHECK-LABEL: test_16f32toub_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
-; CHECK-NEXT: vpmovdw %ymm1, %xmm1
-; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovdw %ymm2, %xmm2
-; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT: vpsllw $15, %ymm1, %ymm1
-; CHECK-NEXT: vpmovw2m %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
-  %a = load <16 x float>, <16 x float>* %ptr
-  %mask = fptoui <16 x float> %a to <16 x i1>
-  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
-  ret <16 x i16> %select
-}
-
-define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
-; CHECK-LABEL: test_16f32toub_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
-; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
-; CHECK-NEXT: vpmovd2m %zmm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
-  %a = load <16 x float>, <16 x float>* %ptr
-  %mask = fptoui <16 x float> %a to <16 x i1>
-  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
-  ret <16 x i16> %select
-}
-
-define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
-; CHECK-LABEL: test_16f32tosb_256:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
-; CHECK-NEXT: vpmovdw %ymm1, %xmm1
-; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovdw %ymm2, %xmm2
-; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT: vpsllw $15, %ymm1, %ymm1
-; CHECK-NEXT: vpmovw2m %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
-  %a = load <16 x float>, <16 x float>* %ptr
-  %mask = fptosi <16 x float> %a to <16 x i1>
-  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
-  ret <16 x i16> %select
-}
-
-define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
-; CHECK-LABEL: test_16f32tosb_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
-; CHECK-NEXT: vpmovd2m %zmm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
-  %a = load <16 x float>, <16 x float>* %ptr
-  %mask = fptosi <16 x float> %a to <16 x i1>
-  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
-  ret <16 x i16> %select
-}