Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -1144,12 +1144,10 @@ } } + // This block controls legalization of the mask vector sizes that are + // available with AVX512. 512-bit vectors are in a separate block controlled + // by useAVX512Regs. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { - addRegisterClass(MVT::v16i32, &X86::VR512RegClass); - addRegisterClass(MVT::v16f32, &X86::VR512RegClass); - addRegisterClass(MVT::v8i64, &X86::VR512RegClass); - addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); addRegisterClass(MVT::v4i1, &X86::VK4RegClass); @@ -1160,8 +1158,6 @@ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); @@ -1200,6 +1196,16 @@ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } + + // This block controls legalization for 512-bit operations with 32/64 bit + // elements. 512-bits can be disabled based on prefer-vector-width and + // required-vector-width function attributes. + if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); + addRegisterClass(MVT::v16f32, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); @@ -1222,7 +1228,9 @@ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); @@ -1352,6 +1360,9 @@ } }// has AVX-512 + // This block controls legalization for operations that don't have + // pre-AVX512 equivalents. Without VLX we use 512-bit operations for + // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. @@ -1406,10 +1417,10 @@ } } + // This block control legalization of v32i1/v64i1 which are available with + // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with + // useBWIRegs. 
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { - addRegisterClass(MVT::v32i16, &X86::VR512RegClass); - addRegisterClass(MVT::v64i8, &X86::VR512RegClass); - addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); @@ -1439,6 +1450,15 @@ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); + } + + // This block controls legalization for v32i16 and v64i8. 512-bits can be + // disabled based on prefer-vector-width and required-vector-width function + // attributes. + if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + // Extends from v64i1 masks to 512-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); @@ -30049,7 +30069,7 @@ EVT VT = N->getValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && - (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) + (!Subtarget.useAVX512Regs() || (VT != MVT::v16f32 && VT != MVT::v8f64))) return false; // We only handle target-independent shuffles. @@ -31086,7 +31106,7 @@ return SDValue(); unsigned RegSize = 128; - if (Subtarget.hasBWI()) + if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; @@ -32664,7 +32684,7 @@ if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL && ((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasAVX2()) || - (VT == MVT::v16i32 && Subtarget.hasBWI()))) { + (VT == MVT::v16i32 && Subtarget.useBWIRegs()))) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); APInt Mask17 = APInt::getHighBitsSet(32, 17); @@ -34190,7 +34210,7 @@ SDValue Op1, F Builder) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if (Subtarget.hasBWI()) { + if (Subtarget.useBWIRegs()) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -36181,7 +36201,7 @@ // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256()) || - (VT.is512BitVector() && Subtarget.hasAVX512())) { + (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); return Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) @@ -36214,7 +36234,7 @@ // On pre-AVX512 targets, split into 256-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. 
- if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) + if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) return SplitAndExtendInReg(256); return SDValue(); @@ -37169,7 +37189,7 @@ EVT VT = N->getValueType(0); unsigned RegSize = 128; - if (Subtarget.hasBWI()) + if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; @@ -37214,7 +37234,7 @@ return SDValue(); unsigned RegSize = 128; - if (Subtarget.hasBWI()) + if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; @@ -37442,8 +37462,8 @@ if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) && !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && - !(Subtarget.hasBWI() && (VT == MVT::v64i8 || VT == MVT::v32i16 || - VT == MVT::v16i32 || VT == MVT::v8i64))) + !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || + VT == MVT::v16i32 || VT == MVT::v8i64))) return SDValue(); SDValue SubusLHS, SubusRHS; Index: llvm/trunk/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.h +++ llvm/trunk/lib/Target/X86/X86Subtarget.h @@ -407,6 +407,9 @@ /// features. unsigned PreferVectorWidth; + /// Required vector width from function attribute. + unsigned RequiredVectorWidth; + /// True if compiling for 64-bit, false for 16-bit or 32-bit. bool In64BitMode; @@ -433,7 +436,8 @@ /// X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride, - unsigned PreferVectorWidthOverride); + unsigned PreferVectorWidthOverride, + unsigned RequiredVectorWidth); const X86TargetLowering *getTargetLowering() const override { return &TLInfo; @@ -622,6 +626,7 @@ bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } unsigned getPreferVectorWidth() const { return PreferVectorWidth; } + unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } // Helper functions to determine when we should allow widening to 512-bit // during codegen. @@ -634,6 +639,16 @@ return hasBWI() && canExtendTo512DQ(); } + // If there are no 512-bit vectors and we prefer not to use 512-bit registers, + // disable them in the legalizer. 
+ bool useAVX512Regs() const { + return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256); + } + + bool useBWIRegs() const { + return hasBWI() && useAVX512Regs(); + } + bool isXRaySupported() const override { return is64Bit(); } X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; } Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.cpp +++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp @@ -373,11 +373,13 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride, - unsigned PreferVectorWidthOverride) + unsigned PreferVectorWidthOverride, + unsigned RequiredVectorWidth) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), + RequiredVectorWidth(RequiredVectorWidth), In64BitMode(TargetTriple.getArch() == Triple::x86_64), In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), Index: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp @@ -259,8 +259,7 @@ // the feature string out later. unsigned CPUFSWidth = Key.size(); - // Translate vector width function attribute into subtarget features. This - // overrides any CPU specific turning parameter + // Extract prefer-vector-width attribute. unsigned PreferVectorWidthOverride = 0; if (F.hasFnAttribute("prefer-vector-width")) { StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString(); @@ -272,6 +271,21 @@ } } + // Extract required-vector-width attribute. + unsigned RequiredVectorWidth = UINT32_MAX; + if (F.hasFnAttribute("required-vector-width")) { + StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString(); + unsigned Width; + if (!Val.getAsInteger(0, Width)) { + Key += ",required-vector-width="; + Key += Val; + RequiredVectorWidth = Width; + } + } + + // Extracted here so that we make sure there is backing for the StringRef. If + // we assigned earlier, its possible the SmallString reallocated leaving a + // dangling StringRef. FS = Key.slice(CPU.size(), CPUFSWidth); auto &I = SubtargetMap[Key]; @@ -282,7 +296,8 @@ resetTargetOptions(F); I = llvm::make_unique(TargetTriple, CPU, FS, *this, Options.StackAlignmentOverride, - PreferVectorWidthOverride); + PreferVectorWidthOverride, + RequiredVectorWidth); } return I.get(); } Index: llvm/trunk/test/CodeGen/X86/required-vector-width.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/required-vector-width.ll +++ llvm/trunk/test/CodeGen/X86/required-vector-width.ll @@ -0,0 +1,628 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s + +; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled. 
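; Each operation is exercised as a pair: a _256 variant carrying
; "required-vector-width"="256", for which 512-bit types stay illegal under the
; prefer-256-bit RUN line and the code is expected to split into 256-bit ymm
; operations, and a _512 variant carrying "required-vector-width"="512", for
; which useAVX512Regs()/useBWIRegs() re-enable the 512-bit register classes for
; that one function and a single zmm operation is expected.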
+ +define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" { +; CHECK-LABEL: add256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %d = load <16 x i32>, <16 x i32>* %a + %e = load <16 x i32>, <16 x i32>* %b + %f = add <16 x i32> %d, %e + store <16 x i32> %f, <16 x i32>* %c + ret void +} + +define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" { +; CHECK-LABEL: add512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %d = load <16 x i32>, <16 x i32>* %a + %e = load <16 x i32>, <16 x i32>* %b + %f = add <16 x i32> %d, %e + store <16 x i32> %f, <16 x i32>* %c + ret void +} + +define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" { +; CHECK-LABEL: avg_v64i8_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 +; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rax) +; CHECK-NEXT: vmovdqu %ymm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = load <64 x i8>, <64 x i8>* %a + %2 = load <64 x i8>, <64 x i8>* %b + %3 = zext <64 x i8> %1 to <64 x i32> + %4 = zext <64 x i8> %2 to <64 x i32> + %5 = add nuw nsw <64 x i32> %3, + %6 = add nuw nsw <64 x i32> %5, %4 + %7 = lshr <64 x i32> %6, + %8 = trunc <64 x i32> %7 to <64 x i8> + store <64 x i8> %8, <64 x i8>* undef, align 4 + ret void +} + + +define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" { +; CHECK-LABEL: avg_v64i8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0 +; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = load <64 x i8>, <64 x i8>* %a + %2 = load <64 x i8>, <64 x i8>* %b + %3 = zext <64 x i8> %1 to <64 x i32> + %4 = zext <64 x i8> %2 to <64 x i32> + %5 = add nuw nsw <64 x i32> %3, + %6 = add nuw nsw <64 x i32> %5, %4 + %7 = lshr <64 x i32> %6, + %8 = trunc <64 x i32> %7 to <64 x i8> + store <64 x i8> %8, <64 x i8>* undef, align 4 + ret void +} + +define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" { +; CHECK-LABEL: pmaddwd_32_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %A = load <32 x i16>, <32 x i16>* %APtr + %B = load <32 x i16>, <32 x i16>* %BPtr + %a = sext <32 x i16> %A to <32 x i32> + %b = sext <32 x i16> %B to <32 x i32> + %m = mul nsw <32 x i32> %a, %b + %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %ret = add <16 x i32> %odd, %even + store <16 x i32> %ret, <16 x i32>* %CPtr + ret void +} + +define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" { +; CHECK-LABEL: pmaddwd_32_512: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %A = load <32 x i16>, <32 x i16>* %APtr + %B = load <32 x i16>, <32 x i16>* %BPtr + %a = sext <32 x i16> %A to <32 x i32> + %b = sext <32 x i16> %B to <32 x i32> + %m = mul nsw <32 x i32> %a, %b + %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %ret = add <16 x i32> %odd, %even + store <16 x i32> %ret, <16 x i32>* %CPtr + ret void +} + +define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" { +; CHECK-LABEL: psubus_64i8_max_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <64 x i8>, <64 x i8>* %xptr + %y = load <64 x i8>, <64 x i8>* %yptr + %cmp = icmp ult <64 x i8> %x, %y + %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x + %res = sub <64 x i8> %max, %y + store <64 x i8> %res, <64 x i8>* %zptr + ret void +} + +define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" { +; CHECK-LABEL: psubus_64i8_max_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <64 x i8>, <64 x i8>* %xptr + %y = load <64 x i8>, <64 x i8>* %yptr + %cmp = icmp ult <64 x i8> %x, %y + %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x + %res = sub <64 x i8> %max, %y + store <64 x i8> %res, <64 x i8>* %zptr + ret void +} + +define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" { +; CHECK-LABEL: _Z9test_charPcS_i_256: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB8_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %xmm4 +; CHECK-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 +; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm6 +; CHECK-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm8 +; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 +; CHECK-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 +; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; CHECK-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm8, %xmm7, %xmm7 +; CHECK-NEXT: vpaddd %ymm3, %ymm7, %ymm3 +; CHECK-NEXT: vpaddd %ymm2, %ymm6, %ymm2 +; CHECK-NEXT: vpaddd %ymm1, %ymm5, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: addq $32, %rcx +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: jne .LBB8_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %3 = zext i32 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] + %4 = getelementptr inbounds i8, i8* %0, i64 %index + %5 = bitcast i8* %4 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 + %6 = sext <32 x i8> %wide.load to <32 x i32> + %7 = getelementptr inbounds i8, i8* %1, i64 %index + %8 = bitcast i8* %7 to <32 x i8>* + %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 + %9 = sext <32 x i8> %wide.load14 to <32 x i32> + %10 = mul nsw <32 x i32> %9, %6 + %11 = add nsw <32 x i32> %10, %vec.phi + %index.next = add i64 %index, 32 + %12 = icmp eq i64 %index.next, %3 + br i1 %12, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> + %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 + %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> + %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf + %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> + %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 + %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> + %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 + %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> + %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 + %13 = extractelement <32 x i32> %bin.rdx20, i32 0 + ret i32 %13 +} + +define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" { +; CHECK-LABEL: _Z9test_charPcS_i_512: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB9_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 +; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 +; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: addq $32, %rcx +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: jne .LBB9_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %3 = zext i32 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] + %4 = getelementptr inbounds i8, i8* %0, i64 %index + %5 = bitcast i8* %4 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 + %6 = sext <32 x i8> %wide.load to <32 x i32> + %7 = getelementptr inbounds i8, i8* %1, i64 %index + %8 = bitcast i8* %7 to <32 x i8>* + %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 + %9 = sext <32 x i8> %wide.load14 to 
<32 x i32> + %10 = mul nsw <32 x i32> %9, %6 + %11 = add nsw <32 x i32> %10, %vec.phi + %index.next = add i64 %index, 32 + %12 = icmp eq i64 %index.next, %3 + br i1 %12, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> + %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 + %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> + %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf + %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> + %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 + %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> + %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 + %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> + %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 + %13 = extractelement <32 x i32> %bin.rdx20, i32 0 + ret i32 %13 +} + +@a = global [1024 x i8] zeroinitializer, align 16 +@b = global [1024 x i8] zeroinitializer, align 16 + +define i32 @sad_16i8_256() "required-vector-width"="256" { +; CHECK-LABEL: sad_16i8_256: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB10_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2 +; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: jne .LBB10_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 + %2 = zext <16 x i8> %wide.load to <16 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 + %5 = zext <16 x i8> %wide.load1 to <16 x i32> + %6 = sub nsw <16 x i32> %2, %5 + %7 = icmp sgt <16 x i32> %6, + %8 = sub nsw <16 x i32> zeroinitializer, %6 + %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 + %10 = add nsw <16 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %.lcssa = phi <16 x i32> [ %10, %vector.body ] + %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> + %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf + %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 + %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> + %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 + %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> + %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 + %12 = 
extractelement <16 x i32> %bin.rdx4, i32 0 + ret i32 %12 +} + +define i32 @sad_16i8_512() "required-vector-width"="512" { +; CHECK-LABEL: sad_16i8_512: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB11_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1 +; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: jne .LBB11_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 + %2 = zext <16 x i8> %wide.load to <16 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 + %5 = zext <16 x i8> %wide.load1 to <16 x i32> + %6 = sub nsw <16 x i32> %2, %5 + %7 = icmp sgt <16 x i32> %6, + %8 = sub nsw <16 x i32> zeroinitializer, %6 + %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 + %10 = add nsw <16 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %.lcssa = phi <16 x i32> [ %10, %vector.body ] + %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> + %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf + %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 + %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> + %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 + %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> + %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 + %12 = extractelement <16 x i32> %bin.rdx4, i32 0 + ret i32 %12 +} + +define <16 x float> @sbto16f32_256(<16 x i32> %a) "required-vector-width"="256" { +; CHECK-LABEL: sbto16f32_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x float> @sbto16f32_512(<16 x i32> %a) "required-vector-width"="512" { +; CHECK-LABEL: sbto16f32_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovd2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2d %k0, %zmm0 +; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x 
double> @sbto16f64_256(<16 x double> %a) "required-vector-width"="256" { +; CHECK-LABEL: sbto16f64_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpltpd %ymm2, %ymm4, %k0 +; CHECK-NEXT: vcmpltpd %ymm3, %ymm4, %k1 +; CHECK-NEXT: kshiftlb $4, %k1, %k1 +; CHECK-NEXT: korb %k1, %k0, %k0 +; CHECK-NEXT: vcmpltpd %ymm0, %ymm4, %k1 +; CHECK-NEXT: vcmpltpd %ymm1, %ymm4, %k2 +; CHECK-NEXT: kshiftlb $4, %k2, %k2 +; CHECK-NEXT: korb %k2, %k1, %k1 +; CHECK-NEXT: vpmovm2d %k1, %ymm1 +; CHECK-NEXT: vcvtdq2pd %xmm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vcvtdq2pd %xmm1, %ymm1 +; CHECK-NEXT: vpmovm2d %k0, %ymm3 +; CHECK-NEXT: vcvtdq2pd %xmm3, %ymm2 +; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm3 +; CHECK-NEXT: vcvtdq2pd %xmm3, %ymm3 +; CHECK-NEXT: retq + %cmpres = fcmp ogt <16 x double> %a, zeroinitializer + %1 = sitofp <16 x i1> %cmpres to <16 x double> + ret <16 x double> %1 +} + +define <16 x double> @sbto16f64_512(<16 x double> %a) "required-vector-width"="512" { +; CHECK-LABEL: sbto16f64_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; CHECK-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; CHECK-NEXT: kunpckbw %k0, %k1, %k0 +; CHECK-NEXT: vpmovm2d %k0, %zmm1 +; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 +; CHECK-NEXT: retq + %cmpres = fcmp ogt <16 x double> %a, zeroinitializer + %1 = sitofp <16 x i1> %cmpres to <16 x double> + ret <16 x double> %1 +} + +define <16 x float> @ubto16f32_256(<16 x i32> %a) "required-vector-width"="256" { +; CHECK-LABEL: ubto16f32_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpand %ymm3, %ymm0, %ymm0 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: vpand %ymm3, %ymm1, %ymm1 +; CHECK-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x float> @ubto16f32_512(<16 x i32> %a) "required-vector-width"="512" { +; CHECK-LABEL: ubto16f32_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovd2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2d %k0, %zmm0 +; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0 +; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x double> @ubto16f64_256(<16 x i32> %a) "required-vector-width"="256" { +; CHECK-LABEL: ubto16f64_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpsrld $31, %ymm0, %ymm3 +; CHECK-NEXT: vcvtdq2pd %xmm3, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm3 +; CHECK-NEXT: vcvtdq2pd %xmm3, %ymm4 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1 +; CHECK-NEXT: vcvtdq2pd %xmm1, %ymm2 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vcvtdq2pd %xmm1, %ymm3 +; CHECK-NEXT: vmovaps %ymm4, %ymm1 +; CHECK-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x double> + ret <16 x double> %1 +} + +define <16 x double> @ubto16f64_512(<16 x i32> %a) "required-vector-width"="512" { +; CHECK-LABEL: ubto16f64_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovd2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2d %k0, %zmm0 +; CHECK-NEXT: vpsrld $31, %zmm0, %zmm1 +; CHECK-NEXT: 
vcvtdq2pd %ymm1, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 +; CHECK-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x double> + ret <16 x double> %1 +} + +define <16 x i32> @test_16f32toub_256(<16 x float> %a, <16 x i32> %passthru) "required-vector-width"="256" { +; CHECK-LABEL: test_16f32toub_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 +; CHECK-NEXT: vcvttps2dq %ymm1, %ymm1 +; CHECK-NEXT: vpmovdw %ymm1, %xmm1 +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0 +; CHECK-NEXT: vpmovw2m %ymm0, %k1 +; CHECK-NEXT: vmovdqa32 %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: retq + %mask = fptoui <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} + +define <16 x i32> @test_16f32toub_512(<16 x float> %a, <16 x i32> %passthru) "required-vector-width"="512" { +; CHECK-LABEL: test_16f32toub_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 +; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fptoui <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} + +define <16 x i32> @test_16f32tosb_256(<16 x float> %a, <16 x i32> %passthru) "required-vector-width"="256" { +; CHECK-LABEL: test_16f32tosb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 +; CHECK-NEXT: vcvttps2dq %ymm1, %ymm1 +; CHECK-NEXT: vpmovdw %ymm1, %xmm1 +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0 +; CHECK-NEXT: vpmovw2m %ymm0, %k1 +; CHECK-NEXT: vmovdqa32 %ymm2, %ymm0 {%k1} {z} +; CHECK-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: retq + %mask = fptosi <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} + +define <16 x i32> @test_16f32tosb_512(<16 x float> %a, <16 x i32> %passthru) "required-vector-width"="512" { +; CHECK-LABEL: test_16f32tosb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fptosi <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +}
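
A companion case the committed test does not cover, sketched here for illustration only (the function name @add_no_attr is ours and there are no autogenerated CHECK lines): when a function carries no "required-vector-width" attribute, getSubtargetImpl leaves RequiredVectorWidth at its UINT32_MAX default, so the RequiredVectorWidth > 256 clause in useAVX512Regs() still holds and the <16 x i32> add below would be expected to remain a single 512-bit operation even under prefer-256-bit.

; Illustrative sketch, not part of the committed test file. No
; "required-vector-width" attribute: RequiredVectorWidth defaults to
; UINT32_MAX (> 256), so useAVX512Regs() stays true and the legalizer keeps
; v16i32 legal as one zmm add rather than splitting into two ymm adds.
define void @add_no_attr(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) {
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}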