Index: lib/Target/X86/CMakeLists.txt =================================================================== --- lib/Target/X86/CMakeLists.txt +++ lib/Target/X86/CMakeLists.txt @@ -58,6 +58,7 @@ X86TargetObjectFile.cpp X86TargetTransformInfo.cpp X86VZeroUpper.cpp + X86VectorWidthInfer.cpp X86WinAllocaExpander.cpp X86WinEHState.cpp ) Index: lib/Target/X86/X86.h =================================================================== --- lib/Target/X86/X86.h +++ lib/Target/X86/X86.h @@ -116,6 +116,12 @@ void initializeEvexToVexInstPassPass(PassRegistry &); +/// This pass tries to infer a required vector width for a function if the +/// required-vector-width attribute isn't present. +FunctionPass *createX86VectorWidthInferPass(); + +void initializeX86VectorWidthInferPass(PassRegistry &); + } // End llvm namespace #endif Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1158,7 +1158,7 @@ // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled - // by useAVX512Regs. + // by canExtendTo512DQ. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); @@ -1224,9 +1224,9 @@ } // This block controls legalization for 512-bit operations with 32/64 bit - // elements. 512-bits can be disabled based on prefer-vector-width and - // required-vector-width function attributes. - if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { + // elements. 512-bits can be disabled based on prefer-vector-width + // function attribute. + if (!Subtarget.useSoftFloat() && Subtarget.canExtendTo512DQ()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); @@ -1450,7 +1450,7 @@ // This block control legalization of v32i1/v64i1 which are available with // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with - // useBWIRegs. + // canExtendTo512BW. if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); @@ -1484,9 +1484,8 @@ } // This block controls legalization for v32i16 and v64i8. 512-bits can be - // disabled based on prefer-vector-width and required-vector-width function - // attributes. - if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { + // disabled based on prefer-vector-width function attribute. + if (!Subtarget.useSoftFloat() && Subtarget.canExtendTo512BW()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); @@ -5120,7 +5119,7 @@ F Builder) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if (Subtarget.useBWIRegs()) { + if (Subtarget.canExtendTo512BW()) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -31510,7 +31509,7 @@ return SDValue(); unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) + if (Subtarget.canExtendTo512BW()) RegSize = 512; else if (Subtarget.hasAVX()) RegSize = 256; @@ -36770,7 +36769,7 @@ // Also use this if we don't have SSE41 to allow the legalizer do its job. 
if (!Subtarget.hasSSE41() || VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256()) || - (VT.is512BitVector() && Subtarget.useAVX512Regs())) { + (VT.is512BitVector() && Subtarget.canExtendTo512DQ())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); return Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) @@ -36803,7 +36802,7 @@ // On pre-AVX512 targets, split into 256-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. - if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) + if (!Subtarget.canExtendTo512DQ() && !(VT.getSizeInBits() % 256)) return SplitAndExtendInReg(256); return SDValue(); @@ -37771,7 +37770,7 @@ EVT VT = N->getValueType(0); unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) + if (Subtarget.canExtendTo512BW()) RegSize = 512; else if (Subtarget.hasAVX()) RegSize = 256; @@ -37822,7 +37821,7 @@ return SDValue(); unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) + if (Subtarget.canExtendTo512BW()) RegSize = 512; else if (Subtarget.hasAVX()) RegSize = 256; @@ -38051,8 +38050,10 @@ if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) && !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && - !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || - VT == MVT::v16i32 || VT == MVT::v8i64))) + !(Subtarget.canExtendTo512BW() && (VT == MVT::v64i8 || + VT == MVT::v32i16 || + VT == MVT::v16i32 || + VT == MVT::v8i64))) return SDValue(); SDValue SubusLHS, SubusRHS; Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -407,9 +407,6 @@ /// features. unsigned PreferVectorWidth; - /// Required vector width from function attribute. - unsigned RequiredVectorWidth; - /// True if compiling for 64-bit, false for 16-bit or 32-bit. bool In64BitMode; @@ -436,8 +433,7 @@ /// X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride, - unsigned PreferVectorWidthOverride, - unsigned RequiredVectorWidth); + unsigned PreferVectorWidthOverride); const X86TargetLowering *getTargetLowering() const override { return &TLInfo; @@ -625,7 +621,6 @@ bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } unsigned getPreferVectorWidth() const { return PreferVectorWidth; } - unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } // Helper functions to determine when we should allow widening to 512-bit // during codegen. @@ -638,16 +633,6 @@ return hasBWI() && canExtendTo512DQ(); } - // If there are no 512-bit vectors and we prefer not to use 512-bit registers, - // disable them in the legalizer. 
-  bool useAVX512Regs() const {
-    return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
-  }
-
-  bool useBWIRegs() const {
-    return hasBWI() && useAVX512Regs();
-  }
-
   bool isXRaySupported() const override { return is64Bit(); }
 
   X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -376,13 +376,11 @@
 X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
                            const X86TargetMachine &TM,
                            unsigned StackAlignOverride,
-                           unsigned PreferVectorWidthOverride,
-                           unsigned RequiredVectorWidth)
+                           unsigned PreferVectorWidthOverride)
     : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
       PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
       StackAlignOverride(StackAlignOverride),
       PreferVectorWidthOverride(PreferVectorWidthOverride),
-      RequiredVectorWidth(RequiredVectorWidth),
       In64BitMode(TargetTriple.getArch() == Triple::x86_64),
       In32BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() != Triple::CODE16),
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -80,6 +80,7 @@
   initializeX86CmovConverterPassPass(PR);
   initializeX86ExecutionDomainFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
+  initializeX86VectorWidthInferPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -271,18 +272,6 @@
     }
   }
 
-  // Extract required-vector-width attribute.
-  unsigned RequiredVectorWidth = UINT32_MAX;
-  if (F.hasFnAttribute("required-vector-width")) {
-    StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString();
-    unsigned Width;
-    if (!Val.getAsInteger(0, Width)) {
-      Key += ",required-vector-width=";
-      Key += Val;
-      RequiredVectorWidth = Width;
-    }
-  }
-
   // Extracted here so that we make sure there is backing for the StringRef. If
   // we assigned earlier, its possible the SmallString reallocated leaving a
   // dangling StringRef.
@@ -296,8 +285,7 @@
     resetTargetOptions(F);
     I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
                                         Options.StackAlignmentOverride,
-                                        PreferVectorWidthOverride,
-                                        RequiredVectorWidth);
+                                        PreferVectorWidthOverride);
   }
   return I.get();
 }
@@ -436,6 +424,8 @@
 }
 
 bool X86PassConfig::addPreISel() {
+  addPass(createX86VectorWidthInferPass());
+
   // Only add this pass for 32-bit x86 Windows.
   const Triple &TT = TM->getTargetTriple();
   if (TT.isOSWindows() && TT.getArch() == Triple::x86)
Index: lib/Target/X86/X86VectorWidthInfer.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/X86VectorWidthInfer.cpp
@@ -0,0 +1,122 @@
+//===- X86VectorWidthInfer.cpp - Infer required-vector-width attribute ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file This pass tries to infer the required vector width for a function
+/// if the required-vector-width attribute isn't present.
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-vector-width-fix"
+
+namespace {
+
+class X86VectorWidthInfer : public FunctionPass {
+public:
+  static char ID; // Pass ID
+
+  X86VectorWidthInfer() : FunctionPass(ID) {
+    initializeX86VectorWidthInferPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char X86VectorWidthInfer::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86VectorWidthInfer, DEBUG_TYPE,
+                      "X86 Infer Vector Width", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(X86VectorWidthInfer, DEBUG_TYPE,
+                    "X86 Infer Vector Width", false, false)
+
+FunctionPass *llvm::createX86VectorWidthInferPass() {
+  return new X86VectorWidthInfer();
+}
+
+bool X86VectorWidthInfer::runOnFunction(Function &F) {
+  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const X86Subtarget *ST =
+      TPC.getTM<X86TargetMachine>().getSubtargetImpl(F);
+
+  // If the target doesn't support 512-bit vectors or doesn't prefer them,
+  // then there is nothing to do.
+  // TODO: Support this for 256 vs 128 as well?
+  if (!ST->hasAVX512() || ST->canExtendTo512DQ())
+    return false;
+
+  unsigned RequiredWidth = 0;
+
+  // If we already have a function attribute, make sure we keep at least its
+  // value.
+  if (F.hasFnAttribute("prefer-vector-width")) {
+    StringRef Val =
+        F.getFnAttribute("prefer-vector-width").getValueAsString();
+    unsigned Width;
+    if (!Val.getAsInteger(0, Width)) {
+      RequiredWidth = Width;
+    }
+  }
+
+  // Check for a vector return type.
+  Type *RetTy = F.getReturnType();
+  if (RetTy->isVectorTy())
+    RequiredWidth = std::max(RequiredWidth, RetTy->getPrimitiveSizeInBits());
+
+  // Check for any vector arguments.
+  for (const auto &A : F.args()) {
+    Type *ArgTy = A.getType();
+    if (ArgTy->isVectorTy())
+      RequiredWidth = std::max(RequiredWidth, ArgTy->getPrimitiveSizeInBits());
+  }
+
+  // Otherwise scan for any calls that need wide registers to match ABI.
+  // Also need this for any target specific intrinsics.
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (auto *CI = dyn_cast<CallInst>(&I)) {
+        // We can handle target independent intrinsics via type legalization,
+        // so skip those.
+        if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+          StringRef Name = II->getCalledFunction()->getName();
+          if (!Name.startswith("llvm.x86."))
+            continue;
+        }
+        // Ok we have a call. Check its types.
+        Type *RetTy = CI->getType();
+        if (RetTy->isVectorTy())
+          RequiredWidth = std::max(RequiredWidth,
+                                   RetTy->getPrimitiveSizeInBits());
+        for (Value *A : CI->arg_operands()) {
+          Type *ArgTy = A->getType();
+          if (ArgTy->isVectorTy())
+            RequiredWidth = std::max(RequiredWidth,
+                                     ArgTy->getPrimitiveSizeInBits());
+        }
+      }
+    }
+  }
+
+  // Remove and replace function's prefer-vector-width attribute.
+ RequiredWidth = PowerOf2Ceil(RequiredWidth); + F.removeFnAttr("prefer-vector-width"); + F.addFnAttr("prefer-vector-width", utostr(RequiredWidth)); + + return false; +} Index: test/CodeGen/X86/O0-pipeline.ll =================================================================== --- test/CodeGen/X86/O0-pipeline.ll +++ test/CodeGen/X86/O0-pipeline.ll @@ -30,6 +30,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: X86 Infer Vector Width ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier Index: test/CodeGen/X86/O3-pipeline.ll =================================================================== --- test/CodeGen/X86/O3-pipeline.ll +++ test/CodeGen/X86/O3-pipeline.ll @@ -50,6 +50,7 @@ ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: X86 Infer Vector Width ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier Index: test/CodeGen/X86/prefer-avx256-legalization.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/prefer-avx256-legalization.ll @@ -0,0 +1,359 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq | FileCheck %s + +; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled. + +define void @add(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "prefer-vector-width"="256" { +; CHECK-LABEL: add: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %d = load <16 x i32>, <16 x i32>* %a + %e = load <16 x i32>, <16 x i32>* %b + %f = add <16 x i32> %d, %e + store <16 x i32> %f, <16 x i32>* %c + ret void +} + +define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) "prefer-vector-width"="256" { +; CHECK-LABEL: avg_v64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 +; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rax) +; CHECK-NEXT: vmovdqu %ymm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = load <64 x i8>, <64 x i8>* %a + %2 = load <64 x i8>, <64 x i8>* %b + %3 = zext <64 x i8> %1 to <64 x i32> + %4 = zext <64 x i8> %2 to <64 x i32> + %5 = add nuw nsw <64 x i32> %3, + %6 = add nuw nsw <64 x i32> %5, %4 + %7 = lshr <64 x i32> %6, + %8 = trunc <64 x i32> %7 to <64 x i8> + store <64 x i8> %8, <64 x i8>* undef, align 4 + ret void +} + +define void @pmaddwd_32(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "prefer-vector-width"="256" { +; CHECK-LABEL: pmaddwd_32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %A = load <32 x i16>, <32 x i16>* %APtr + %B = 
load <32 x i16>, <32 x i16>* %BPtr + %a = sext <32 x i16> %A to <32 x i32> + %b = sext <32 x i16> %B to <32 x i32> + %m = mul nsw <32 x i32> %a, %b + %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %ret = add <16 x i32> %odd, %even + store <16 x i32> %ret, <16 x i32>* %CPtr + ret void +} + +define void @psubus_64i8_max(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "prefer-vector-width"="256" { +; CHECK-LABEL: psubus_64i8_max: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <64 x i8>, <64 x i8>* %xptr + %y = load <64 x i8>, <64 x i8>* %yptr + %cmp = icmp ult <64 x i8> %x, %y + %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x + %res = sub <64 x i8> %max, %y + store <64 x i8> %res, <64 x i8>* %zptr + ret void +} + +define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i32) "prefer-vector-width"="256" { +; CHECK-LABEL: _Z9test_charPcS_i: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB4_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %xmm4 +; CHECK-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 +; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm6 +; CHECK-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm8 +; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 +; CHECK-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 +; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; CHECK-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm8, %xmm7, %xmm7 +; CHECK-NEXT: vpaddd %ymm3, %ymm7, %ymm3 +; CHECK-NEXT: vpaddd %ymm2, %ymm6, %ymm2 +; CHECK-NEXT: vpaddd %ymm1, %ymm5, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: addq $32, %rcx +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: jne .LBB4_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %3 = zext i32 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] + %4 = getelementptr inbounds i8, i8* %0, i64 %index + %5 = bitcast i8* %4 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 + %6 = sext <32 x i8> %wide.load to <32 x i32> + %7 = getelementptr inbounds i8, i8* %1, i64 %index + %8 = bitcast i8* %7 to <32 x i8>* + %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 + %9 = sext <32 x i8> %wide.load14 to <32 x i32> + %10 = mul nsw <32 x i32> %9, %6 + %11 = add nsw <32 x i32> %10, 
%vec.phi + %index.next = add i64 %index, 32 + %12 = icmp eq i64 %index.next, %3 + br i1 %12, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> + %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 + %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> + %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf + %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> + %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 + %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> + %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 + %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> + %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 + %13 = extractelement <32 x i32> %bin.rdx20, i32 0 + ret i32 %13 +} + +@a = global [1024 x i8] zeroinitializer, align 16 +@b = global [1024 x i8] zeroinitializer, align 16 + +define i32 @sad_16i8() "prefer-vector-width"="256" { +; CHECK-LABEL: sad_16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB5_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2 +; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: jne .LBB5_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 + %2 = zext <16 x i8> %wide.load to <16 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 + %5 = zext <16 x i8> %wide.load1 to <16 x i32> + %6 = sub nsw <16 x i32> %2, %5 + %7 = icmp sgt <16 x i32> %6, + %8 = sub nsw <16 x i32> zeroinitializer, %6 + %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 + %10 = add nsw <16 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %.lcssa = phi <16 x i32> [ %10, %vector.body ] + %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> + %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf + %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 + %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> + %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 + %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> + %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 + %12 = extractelement <16 x i32> %bin.rdx4, i32 0 + ret i32 %12 +} + +define void @sbto16f32(<16 x i16> 
%a, <16 x float>* %res) "prefer-vector-width"="256" { +; CHECK-LABEL: sbto16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vpmovm2d %k0, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vmovaps %ymm1, (%rdi) +; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = icmp slt <16 x i16> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x float> + store <16 x float> %1, <16 x float>* %res + ret void +} + +define void @sbto16f64(<16 x i16> %a, <16 x double>* %res) "prefer-vector-width"="256" { +; CHECK-LABEL: sbto16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 +; CHECK-NEXT: vpmovm2d %k0, %ymm2 +; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 +; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 +; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) +; CHECK-NEXT: vmovaps %ymm3, (%rdi) +; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = icmp slt <16 x i16> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x double> + store <16 x double> %1, <16 x double>* %res + ret void +} + +define void @ubto16f32(<16 x i16> %a, <16 x float>* %res) "prefer-vector-width"="256" { +; CHECK-LABEL: ubto16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vpmovm2d %k0, %ymm1 +; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vmovaps %ymm1, (%rdi) +; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = icmp slt <16 x i16> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x float> + store <16 x float> %1, <16 x float>* %res + ret void +} + +define void @ubto16f64(<16 x i16> %a, <16 x double>* %res) "prefer-vector-width"="256" { +; CHECK-LABEL: ubto16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovw2m %ymm0, %k0 +; CHECK-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-NEXT: vpmovm2d %k1, %ymm0 +; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 +; CHECK-NEXT: vpmovm2d %k0, %ymm2 +; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2 +; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 +; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 +; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) +; CHECK-NEXT: vmovaps %ymm3, (%rdi) +; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = icmp slt <16 x i16> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x double> + store <16 x double> %1, <16 x double>* %res + ret void +} + +define <16 x i16> @test_16f32toub(<16 x float>* %ptr, <16 x i16> %passthru) "prefer-vector-width"="256" { +; CHECK-LABEL: test_16f32toub: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 +; CHECK-NEXT: vpmovdw %ymm1, %xmm1 +; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm2 +; CHECK-NEXT: vpmovdw %ymm2, %xmm2 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vpsllw $15, %ymm1, %ymm1 +; 
CHECK-NEXT: vpmovw2m %ymm1, %k1 +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = load <16 x float>, <16 x float>* %ptr + %mask = fptoui <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer + ret <16 x i16> %select +} + +define <16 x i16> @test_16f32tosb(<16 x float>* %ptr, <16 x i16> %passthru) "prefer-vector-width"="256" { +; CHECK-LABEL: test_16f32tosb: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 +; CHECK-NEXT: vpmovdw %ymm1, %xmm1 +; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm2 +; CHECK-NEXT: vpmovdw %ymm2, %xmm2 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vpsllw $15, %ymm1, %ymm1 +; CHECK-NEXT: vpmovw2m %ymm1, %k1 +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %a = load <16 x float>, <16 x float>* %ptr + %mask = fptosi <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer + ret <16 x i16> %select +} Index: test/CodeGen/X86/required-vector-width.ll =================================================================== --- test/CodeGen/X86/required-vector-width.ll +++ /dev/null @@ -1,655 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s - -; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled. - -define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" { -; CHECK-LABEL: add256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %d = load <16 x i32>, <16 x i32>* %a - %e = load <16 x i32>, <16 x i32>* %b - %f = add <16 x i32> %d, %e - store <16 x i32> %f, <16 x i32>* %c - ret void -} - -define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" { -; CHECK-LABEL: add512: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0 -; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %d = load <16 x i32>, <16 x i32>* %a - %e = load <16 x i32>, <16 x i32>* %b - %f = add <16 x i32> %d, %e - store <16 x i32> %f, <16 x i32>* %c - ret void -} - -define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" { -; CHECK-LABEL: avg_v64i8_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rsi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, (%rax) -; CHECK-NEXT: vmovdqu %ymm0, (%rax) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %1 = load <64 x i8>, <64 x i8>* %a - %2 = load <64 x i8>, <64 x i8>* %b - %3 = zext <64 x i8> %1 to <64 x i32> - %4 = zext <64 x i8> %2 to <64 x i32> - %5 = add nuw nsw <64 x i32> %3, - %6 = add nuw nsw <64 x i32> %5, %4 - %7 = lshr <64 x i32> %6, - %8 = trunc <64 x i32> %7 to <64 x i8> - store <64 x i8> %8, <64 x i8>* undef, align 4 - ret void -} - - -define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" { -; 
CHECK-LABEL: avg_v64i8_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0 -; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %1 = load <64 x i8>, <64 x i8>* %a - %2 = load <64 x i8>, <64 x i8>* %b - %3 = zext <64 x i8> %1 to <64 x i32> - %4 = zext <64 x i8> %2 to <64 x i32> - %5 = add nuw nsw <64 x i32> %3, - %6 = add nuw nsw <64 x i32> %5, %4 - %7 = lshr <64 x i32> %6, - %8 = trunc <64 x i32> %7 to <64 x i8> - store <64 x i8> %8, <64 x i8>* undef, align 4 - ret void -} - -define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" { -; CHECK-LABEL: pmaddwd_32_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %A = load <32 x i16>, <32 x i16>* %APtr - %B = load <32 x i16>, <32 x i16>* %BPtr - %a = sext <32 x i16> %A to <32 x i32> - %b = sext <32 x i16> %B to <32 x i32> - %m = mul nsw <32 x i32> %a, %b - %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> - %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> - %ret = add <16 x i32> %odd, %even - store <16 x i32> %ret, <16 x i32>* %CPtr - ret void -} - -define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" { -; CHECK-LABEL: pmaddwd_32_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 -; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %A = load <32 x i16>, <32 x i16>* %APtr - %B = load <32 x i16>, <32 x i16>* %BPtr - %a = sext <32 x i16> %A to <32 x i32> - %b = sext <32 x i16> %B to <32 x i32> - %m = mul nsw <32 x i32> %a, %b - %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> - %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> - %ret = add <16 x i32> %odd, %even - store <16 x i32> %ret, <16 x i32>* %CPtr - ret void -} - -define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" { -; CHECK-LABEL: psubus_64i8_max_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %x = load <64 x i8>, <64 x i8>* %xptr - %y = load <64 x i8>, <64 x i8>* %yptr - %cmp = icmp ult <64 x i8> %x, %y - %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x - %res = sub <64 x i8> %max, %y - store <64 x i8> %res, <64 x i8>* %zptr - ret void -} - -define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" { -; CHECK-LABEL: psubus_64i8_max_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0 -; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %x = load <64 x i8>, <64 x i8>* %xptr - %y = load <64 x i8>, <64 x i8>* %yptr - %cmp = icmp ult <64 x i8> %x, %y - %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x - %res = sub <64 x i8> %max, %y - store <64 x i8> %res, <64 x i8>* %zptr - ret void -} 
- -define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" { -; CHECK-LABEL: _Z9test_charPcS_i_256: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB8_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %xmm4 -; CHECK-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 -; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm6 -; CHECK-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm8 -; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 -; CHECK-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 -; CHECK-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 -; CHECK-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 -; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 -; CHECK-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 -; CHECK-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 -; CHECK-NEXT: vpmaddwd %xmm8, %xmm7, %xmm7 -; CHECK-NEXT: vpaddd %ymm3, %ymm7, %ymm3 -; CHECK-NEXT: vpaddd %ymm2, %ymm6, %ymm2 -; CHECK-NEXT: vpaddd %ymm1, %ymm5, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0 -; CHECK-NEXT: addq $32, %rcx -; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: jne .LBB8_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %3 = zext i32 %2 to i64 - br label %vector.body - -vector.body: - %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] - %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] - %4 = getelementptr inbounds i8, i8* %0, i64 %index - %5 = bitcast i8* %4 to <32 x i8>* - %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 - %6 = sext <32 x i8> %wide.load to <32 x i32> - %7 = getelementptr inbounds i8, i8* %1, i64 %index - %8 = bitcast i8* %7 to <32 x i8>* - %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 - %9 = sext <32 x i8> %wide.load14 to <32 x i32> - %10 = mul nsw <32 x i32> %9, %6 - %11 = add nsw <32 x i32> %10, %vec.phi - %index.next = add i64 %index, 32 - %12 = icmp eq i64 %index.next, %3 - br i1 %12, label %middle.block, label %vector.body - -middle.block: - %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> - %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 - %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> - %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf - %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> - %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 - %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> - %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 - %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> - %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 - %13 = extractelement <32 x i32> %bin.rdx20, i32 0 - ret i32 %13 -} - -define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" { -; CHECK-LABEL: _Z9test_charPcS_i_512: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edx, 
%eax -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB9_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 -; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 -; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 -; CHECK-NEXT: addq $32, %rcx -; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: jne .LBB9_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %3 = zext i32 %2 to i64 - br label %vector.body - -vector.body: - %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] - %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] - %4 = getelementptr inbounds i8, i8* %0, i64 %index - %5 = bitcast i8* %4 to <32 x i8>* - %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 - %6 = sext <32 x i8> %wide.load to <32 x i32> - %7 = getelementptr inbounds i8, i8* %1, i64 %index - %8 = bitcast i8* %7 to <32 x i8>* - %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 - %9 = sext <32 x i8> %wide.load14 to <32 x i32> - %10 = mul nsw <32 x i32> %9, %6 - %11 = add nsw <32 x i32> %10, %vec.phi - %index.next = add i64 %index, 32 - %12 = icmp eq i64 %index.next, %3 - br i1 %12, label %middle.block, label %vector.body - -middle.block: - %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> - %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 - %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> - %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf - %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> - %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 - %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> - %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 - %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> - %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 - %13 = extractelement <32 x i32> %bin.rdx20, i32 0 - ret i32 %13 -} - -@a = global [1024 x i8] zeroinitializer, align 16 -@b = global [1024 x i8] zeroinitializer, align 16 - -define i32 @sad_16i8_256() "required-vector-width"="256" { -; CHECK-LABEL: sad_16i8_256: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB10_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2 -; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 -; CHECK-NEXT: addq $4, %rax -; CHECK-NEXT: jne .LBB10_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; 
CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] - %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index - %1 = bitcast i8* %0 to <16 x i8>* - %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 - %2 = zext <16 x i8> %wide.load to <16 x i32> - %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index - %4 = bitcast i8* %3 to <16 x i8>* - %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 - %5 = zext <16 x i8> %wide.load1 to <16 x i32> - %6 = sub nsw <16 x i32> %2, %5 - %7 = icmp sgt <16 x i32> %6, - %8 = sub nsw <16 x i32> zeroinitializer, %6 - %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 - %10 = add nsw <16 x i32> %9, %vec.phi - %index.next = add i64 %index, 4 - %11 = icmp eq i64 %index.next, 1024 - br i1 %11, label %middle.block, label %vector.body - -middle.block: - %.lcssa = phi <16 x i32> [ %10, %vector.body ] - %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> - %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf - %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> - %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 - %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> - %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 - %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> - %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 - %12 = extractelement <16 x i32> %bin.rdx4, i32 0 - ret i32 %12 -} - -define i32 @sad_16i8_512() "required-vector-width"="512" { -; CHECK-LABEL: sad_16i8_512: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB11_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1 -; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: addq $4, %rax -; CHECK-NEXT: jne .LBB11_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] - %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index - %1 = bitcast i8* %0 to <16 x i8>* - %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 - %2 = zext <16 x i8> %wide.load to <16 x i32> - %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index - %4 = bitcast i8* %3 to <16 x i8>* - %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 - %5 = zext <16 x i8> %wide.load1 to <16 x i32> - %6 = sub nsw <16 x i32> %2, %5 - %7 = icmp sgt <16 x i32> %6, - %8 = sub nsw <16 x i32> zeroinitializer, %6 - %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 - %10 = add nsw <16 x i32> %9, %vec.phi - %index.next = add i64 %index, 4 - 
%11 = icmp eq i64 %index.next, 1024 - br i1 %11, label %middle.block, label %vector.body - -middle.block: - %.lcssa = phi <16 x i32> [ %10, %vector.body ] - %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> - %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf - %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> - %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 - %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> - %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 - %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> - %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 - %12 = extractelement <16 x i32> %bin.rdx4, i32 0 - ret i32 %12 -} - -define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" { -; CHECK-LABEL: sbto16f32_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = sitofp <16 x i1> %mask to <16 x float> - store <16 x float> %1, <16 x float>* %res - ret void -} - -define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" { -; CHECK-LABEL: sbto16f32_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = sitofp <16 x i1> %mask to <16 x float> - store <16 x float> %1, <16 x float>* %res - ret void -} - -define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" { -; CHECK-LABEL: sbto16f64_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = sitofp <16 x i1> %mask to <16 x double> - store <16 x double> %1, <16 x double>* %res - ret void -} - -define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" { -; CHECK-LABEL: sbto16f64_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = sitofp <16 x i1> %mask to <16 x double> - store <16 x double> %1, <16 x double>* %res - ret void -} - -define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" { -; CHECK-LABEL: ubto16f32_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m 
%ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm1 -; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = uitofp <16 x i1> %mask to <16 x float> - store <16 x float> %1, <16 x float>* %res - ret void -} - -define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" { -; CHECK-LABEL: ubto16f32_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0 -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = uitofp <16 x i1> %mask to <16 x float> - store <16 x float> %1, <16 x float>* %res - ret void -} - -define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" { -; CHECK-LABEL: ubto16f64_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: kshiftrw $8, %k0, %k1 -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 -; CHECK-NEXT: vpmovm2d %k0, %ymm2 -; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = uitofp <16 x i1> %mask to <16 x double> - store <16 x double> %1, <16 x double>* %res - ret void -} - -define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" { -; CHECK-LABEL: ubto16f64_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovw2m %ymm0, %k0 -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0 -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq - %mask = icmp slt <16 x i16> %a, zeroinitializer - %1 = uitofp <16 x i1> %mask to <16 x double> - store <16 x double> %1, <16 x double>* %res - ret void -} - -define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" { -; CHECK-LABEL: test_16f32toub_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 -; CHECK-NEXT: vpmovdw %ymm1, %xmm1 -; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovdw %ymm2, %xmm2 -; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vpsllw $15, %ymm1, %ymm1 -; CHECK-NEXT: vpmovw2m %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %a = load <16 x float>, <16 x float>* %ptr - %mask = fptoui <16 x float> %a to <16 x i1> - %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer - ret <16 x i16> %select -} - -define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" { -; 
CHECK-LABEL: test_16f32toub_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1 -; CHECK-NEXT: vpslld $31, %zmm1, %zmm1 -; CHECK-NEXT: vpmovd2m %zmm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %a = load <16 x float>, <16 x float>* %ptr - %mask = fptoui <16 x float> %a to <16 x i1> - %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer - ret <16 x i16> %select -} - -define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" { -; CHECK-LABEL: test_16f32tosb_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 -; CHECK-NEXT: vpmovdw %ymm1, %xmm1 -; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovdw %ymm2, %xmm2 -; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vpsllw $15, %ymm1, %ymm1 -; CHECK-NEXT: vpmovw2m %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %a = load <16 x float>, <16 x float>* %ptr - %mask = fptosi <16 x float> %a to <16 x i1> - %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer - ret <16 x i16> %select -} - -define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" { -; CHECK-LABEL: test_16f32tosb_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1 -; CHECK-NEXT: vpmovd2m %zmm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq - %a = load <16 x float>, <16 x float>* %ptr - %mask = fptosi <16 x float> %a to <16 x i1> - %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer - ret <16 x i16> %select -}
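
; A minimal sketch of the new pass's effect (not part of the diff itself; the
; function name @infer_example is made up for illustration). Assume AVX512 is
; enabled but the subtarget currently prefers 256-bit vectors:
;
;   ; Before the X86 Infer Vector Width pass, the attribute only reflects the
;   ; 256-bit preference even though the signature uses a 512-bit type.
;   define <16 x i32> @infer_example(<16 x i32> %x) "prefer-vector-width"="256" {
;     %r = add <16 x i32> %x, %x
;     ret <16 x i32> %r
;   }
;
;   ; After the pass, the widest vector found in the return type, arguments,
;   ; and llvm.x86.* calls (512 bits here) is rounded up to a power of two and
;   ; written back, so the function carries "prefer-vector-width"="512" and the
;   ; legalizer may keep <16 x i32> in a single 512-bit register.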