Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1138,11 +1138,6 @@ } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { - addRegisterClass(MVT::v16i32, &X86::VR512RegClass); - addRegisterClass(MVT::v16f32, &X86::VR512RegClass); - addRegisterClass(MVT::v8i64, &X86::VR512RegClass); - addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); addRegisterClass(MVT::v4i1, &X86::VK4RegClass); @@ -1204,6 +1199,13 @@ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } + + if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); + addRegisterClass(MVT::v16f32, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); @@ -1356,8 +1358,7 @@ } }// has AVX-512 - if (!Subtarget.useSoftFloat() && - (Subtarget.hasAVX512() || Subtarget.hasVLX())) { + if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? @@ -1412,13 +1413,9 @@ } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { - addRegisterClass(MVT::v32i16, &X86::VR512RegClass); - addRegisterClass(MVT::v64i8, &X86::VR512RegClass); - addRegisterClass(MVT::v32i1, &X86::VK32RegClass); - addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - for (auto VT : { MVT::v32i1, MVT::v64i1 }) { + for (auto VT : { MVT::v32i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); @@ -1434,16 +1431,40 @@ } setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); - for (auto VT : { MVT::v16i1, MVT::v32i1 }) - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i1, Custom); // Extends from v32i1 masks to 256-bit vectors. 
 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
+ addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+
+ for (auto VT : { MVT::v64i1 }) {
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i1, Custom);
+
 // Extends from v64i1 masks to 512-bit vectors.
 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
@@ -1509,8 +1530,7 @@
 }
 }
- if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
- (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
@@ -29985,7 +30005,7 @@
 EVT VT = N->getValueType(0);
 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
- (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ (!Subtarget.useAVX512Regs() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
 return false;
 // We only handle target-independent shuffles.
@@ -30935,7 +30955,7 @@
 return SDValue();
 unsigned RegSize = 128;
- if (Subtarget.hasBWI())
+ if (Subtarget.useBWIRegs())
 RegSize = 512;
 else if (Subtarget.hasAVX2())
 RegSize = 256;
@@ -32599,7 +32619,7 @@
 if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL &&
 ((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
 (VT == MVT::v8i32 && Subtarget.hasAVX2()) ||
- (VT == MVT::v16i32 && Subtarget.hasBWI()))) {
+ (VT == MVT::v16i32 && Subtarget.useBWIRegs()))) {
 SDValue N0 = N->getOperand(0);
 SDValue N1 = N->getOperand(1);
 APInt Mask17 = APInt::getHighBitsSet(32, 17);
@@ -33998,6 +34018,11 @@
 if (!SrcVT.isVector())
 return false;
+ // Don't allow 512-bit vectors if they aren't supported.
+ // TODO: This function feels like it could make use of isLegalType.
+ if (SrcVT.is512BitVector() && !Subtarget.useAVX512Regs()) + return false; + EVT SrcElVT = SrcVT.getScalarType(); EVT DstElVT = DstVT.getScalarType(); if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32) @@ -34108,7 +34133,7 @@ SDValue Op1, F Builder) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if (Subtarget.hasBWI()) { + if (Subtarget.useBWIRegs()) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -36099,7 +36124,7 @@ // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256()) || - (VT.is512BitVector() && Subtarget.hasAVX512())) { + (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); return Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) @@ -36132,7 +36157,7 @@ // On pre-AVX512 targets, split into 256-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. - if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) + if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) return SplitAndExtendInReg(256); return SDValue(); @@ -37090,7 +37115,7 @@ EVT VT = N->getValueType(0); unsigned RegSize = 128; - if (Subtarget.hasBWI()) + if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; @@ -37135,7 +37160,7 @@ return SDValue(); unsigned RegSize = 128; - if (Subtarget.hasBWI()) + if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; @@ -37363,8 +37388,8 @@ if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) && !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && - !(Subtarget.hasBWI() && (VT == MVT::v64i8 || VT == MVT::v32i16 || - VT == MVT::v16i32 || VT == MVT::v8i64))) + !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || + VT == MVT::v16i32 || VT == MVT::v8i64))) return SDValue(); SDValue SubusLHS, SubusRHS; Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -407,6 +407,9 @@ /// features. unsigned PreferVectorWidth; + /// Required vector width from function attribute. + unsigned RequiredVectorWidth; + /// True if compiling for 64-bit, false for 16-bit or 32-bit. bool In64BitMode; @@ -433,7 +436,8 @@ /// X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride, - unsigned PreferVectorWidthOverride); + unsigned PreferVectorWidthOverride, + unsigned RequiredVectorWidth); const X86TargetLowering *getTargetLowering() const override { return &TLInfo; @@ -622,6 +626,7 @@ bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } unsigned getPreferVectorWidth() const { return PreferVectorWidth; } + unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } // Helper functions to determine when we should allow widening to 512-bit // during codegen. @@ -634,6 +639,16 @@ return hasBWI() && canExtendTo512DQ(); } + // If there are no 512-bit vectors and we prefer not to use 512-bit registers, + // disable them in the legalizer. 
+ bool useAVX512Regs() const {
+ return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
+ }
+
+ bool useBWIRegs() const {
+ return hasBWI() && useAVX512Regs();
+ }
+
 bool isXRaySupported() const override { return is64Bit(); }
 X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -373,11 +373,13 @@
 X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
 const X86TargetMachine &TM,
 unsigned StackAlignOverride,
- unsigned PreferVectorWidthOverride)
+ unsigned PreferVectorWidthOverride,
+ unsigned RequiredVectorWidth)
 : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
 PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
 StackAlignOverride(StackAlignOverride),
 PreferVectorWidthOverride(PreferVectorWidthOverride),
+ RequiredVectorWidth(RequiredVectorWidth),
 In64BitMode(TargetTriple.getArch() == Triple::x86_64),
 In32BitMode(TargetTriple.getArch() == Triple::x86 &&
 TargetTriple.getEnvironment() != Triple::CODE16),
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -259,8 +259,7 @@
 // the feature string out later.
 unsigned CPUFSWidth = Key.size();
- // Translate vector width function attribute into subtarget features. This
- // overrides any CPU specific turning parameter
+ // Extract prefer-vector-width attribute.
 unsigned PreferVectorWidthOverride = 0;
 if (F.hasFnAttribute("prefer-vector-width")) {
 StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString();
@@ -272,6 +271,21 @@
 }
 }
+ // Extract required-vector-width attribute.
+ unsigned RequiredVectorWidth = UINT32_MAX;
+ if (F.hasFnAttribute("required-vector-width")) {
+ StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString();
+ unsigned Width;
+ if (!Val.getAsInteger(0, Width)) {
+ Key += ",required-vector-width=";
+ Key += Val;
+ RequiredVectorWidth = Width;
+ }
+ }
+
+ // Extracted here so that we make sure there is backing for the StringRef. If
+ // we assigned earlier, it's possible the SmallString reallocated, leaving a
+ // dangling StringRef.
 FS = Key.slice(CPU.size(), CPUFSWidth);
 auto &I = SubtargetMap[Key];
@@ -282,7 +296,8 @@
 resetTargetOptions(F);
 I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
 Options.StackAlignmentOverride,
- PreferVectorWidthOverride);
+ PreferVectorWidthOverride,
+ RequiredVectorWidth);
 }
 return I.get();
 }
Index: test/CodeGen/X86/required-vector-width.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/required-vector-width.ll
@@ -0,0 +1,432 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s
+
+; This file primarily contains tests for specific places in X86ISelLowering.cpp
+; that needed to be made aware of the legalizer not allowing 512-bit vectors
+; due to prefer-256-bit even though AVX512 is enabled.
+ +define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" { +; CHECK-LABEL: add256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %d = load <16 x i32>, <16 x i32>* %a + %e = load <16 x i32>, <16 x i32>* %b + %f = add <16 x i32> %d, %e + store <16 x i32> %f, <16 x i32>* %c + ret void +} + +define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" { +; CHECK-LABEL: add512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %d = load <16 x i32>, <16 x i32>* %a + %e = load <16 x i32>, <16 x i32>* %b + %f = add <16 x i32> %d, %e + store <16 x i32> %f, <16 x i32>* %c + ret void +} + +define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" { +; CHECK-LABEL: avg_v64i8_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 +; CHECK-NEXT: vmovdqa (%rsi), %ymm1 +; CHECK-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; CHECK-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vmovdqu %ymm0, (%rax) +; CHECK-NEXT: vmovdqu %ymm1, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = load <64 x i8>, <64 x i8>* %a + %2 = load <64 x i8>, <64 x i8>* %b + %3 = zext <64 x i8> %1 to <64 x i32> + %4 = zext <64 x i8> %2 to <64 x i32> + %5 = add nuw nsw <64 x i32> %3, + %6 = add nuw nsw <64 x i32> %5, %4 + %7 = lshr <64 x i32> %6, + %8 = trunc <64 x i32> %7 to <64 x i8> + store <64 x i8> %8, <64 x i8>* undef, align 4 + ret void +} + + +define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" { +; CHECK-LABEL: avg_v64i8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0 +; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = load <64 x i8>, <64 x i8>* %a + %2 = load <64 x i8>, <64 x i8>* %b + %3 = zext <64 x i8> %1 to <64 x i32> + %4 = zext <64 x i8> %2 to <64 x i32> + %5 = add nuw nsw <64 x i32> %3, + %6 = add nuw nsw <64 x i32> %5, %4 + %7 = lshr <64 x i32> %6, + %8 = trunc <64 x i32> %7 to <64 x i8> + store <64 x i8> %8, <64 x i8>* undef, align 4 + ret void +} + +define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" { +; CHECK-LABEL: pmaddwd_32_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %A = load <32 x i16>, <32 x i16>* %APtr + %B = load <32 x i16>, <32 x i16>* %BPtr + %a = sext <32 x i16> %A to <32 x i32> + %b = sext <32 x i16> %B to <32 x i32> + %m = mul nsw <32 x i32> %a, %b + %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %ret = add <16 x i32> %odd, %even + store <16 x i32> %ret, <16 x i32>* %CPtr + ret void +} + +define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" { +; CHECK-LABEL: pmaddwd_32_512: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %A = load <32 x i16>, <32 x i16>* %APtr + %B = load <32 x i16>, <32 x i16>* %BPtr + %a = sext <32 x i16> %A to <32 x i32> + %b = sext <32 x i16> %B to <32 x i32> + %m = mul nsw <32 x i32> %a, %b + %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> + %ret = add <16 x i32> %odd, %even + store <16 x i32> %ret, <16 x i32>* %CPtr + ret void +} + +define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" { +; CHECK-LABEL: psubus_64i8_max_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 +; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <64 x i8>, <64 x i8>* %xptr + %y = load <64 x i8>, <64 x i8>* %yptr + %cmp = icmp ult <64 x i8> %x, %y + %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x + %res = sub <64 x i8> %max, %y + store <64 x i8> %res, <64 x i8>* %zptr + ret void +} + +define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" { +; CHECK-LABEL: psubus_64i8_max_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <64 x i8>, <64 x i8>* %xptr + %y = load <64 x i8>, <64 x i8>* %yptr + %cmp = icmp ult <64 x i8> %x, %y + %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x + %res = sub <64 x i8> %max, %y + store <64 x i8> %res, <64 x i8>* %zptr + ret void +} + +define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" { +; CHECK-LABEL: _Z9test_charPcS_i_256: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB8_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %xmm4 +; CHECK-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 +; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm6 +; CHECK-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm8 +; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 +; CHECK-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 +; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; CHECK-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; CHECK-NEXT: vpmaddwd %xmm8, %xmm7, %xmm7 +; CHECK-NEXT: vpaddd %ymm3, %ymm7, %ymm3 +; CHECK-NEXT: vpaddd %ymm2, %ymm6, %ymm2 +; CHECK-NEXT: vpaddd %ymm1, %ymm5, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: addq $32, %rcx +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: jne .LBB8_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %3 = zext i32 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] + %4 = getelementptr inbounds i8, i8* %0, i64 %index + %5 = bitcast i8* %4 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 + %6 = sext <32 x i8> %wide.load to <32 x i32> + %7 = getelementptr inbounds i8, i8* %1, i64 %index + %8 = bitcast i8* %7 to <32 x i8>* + %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 + %9 = sext <32 x i8> %wide.load14 to <32 x i32> + %10 = mul nsw <32 x i32> %9, %6 + %11 = add nsw <32 x i32> %10, %vec.phi + %index.next = add i64 %index, 32 + %12 = icmp eq i64 %index.next, %3 + br i1 %12, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> + %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 + %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> + %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf + %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> + %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 + %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> + %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 + %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> + %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 + %13 = extractelement <32 x i32> %bin.rdx20, i32 0 + ret i32 %13 +} + +define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" { +; CHECK-LABEL: _Z9test_charPcS_i_512: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB9_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 +; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 +; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: addq $32, %rcx +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: jne .LBB9_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %3 = zext i32 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] + %4 = getelementptr inbounds i8, i8* %0, i64 %index + %5 = bitcast i8* %4 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %5, align 1 + %6 = sext <32 x i8> %wide.load to <32 x i32> + %7 = getelementptr inbounds i8, i8* %1, i64 %index + %8 = bitcast i8* %7 to <32 x i8>* + %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1 + %9 = sext <32 x i8> %wide.load14 to 
<32 x i32> + %10 = mul nsw <32 x i32> %9, %6 + %11 = add nsw <32 x i32> %10, %vec.phi + %index.next = add i64 %index, 32 + %12 = icmp eq i64 %index.next, %3 + br i1 %12, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> + %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1 + %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> + %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf + %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> + %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15 + %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> + %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17 + %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> + %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19 + %13 = extractelement <32 x i32> %bin.rdx20, i32 0 + ret i32 %13 +} + +@a = global [1024 x i8] zeroinitializer, align 16 +@b = global [1024 x i8] zeroinitializer, align 16 + +define i32 @sad_16i8_256() "required-vector-width"="256" { +; CHECK-LABEL: sad_16i8_256: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB10_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2 +; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: jne .LBB10_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 + %2 = zext <16 x i8> %wide.load to <16 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 + %5 = zext <16 x i8> %wide.load1 to <16 x i32> + %6 = sub nsw <16 x i32> %2, %5 + %7 = icmp sgt <16 x i32> %6, + %8 = sub nsw <16 x i32> zeroinitializer, %6 + %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 + %10 = add nsw <16 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %.lcssa = phi <16 x i32> [ %10, %vector.body ] + %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> + %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf + %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 + %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> + %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 + %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> + %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 + %12 = 
extractelement <16 x i32> %bin.rdx4, i32 0 + ret i32 %12 +} + +define i32 @sad_16i8_512() "required-vector-width"="512" { +; CHECK-LABEL: sad_16i8_512: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB11_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1 +; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: jne .LBB11_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %1, align 4 + %2 = zext <16 x i8> %wide.load to <16 x i32> + %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4 + %5 = zext <16 x i8> %wide.load1 to <16 x i32> + %6 = sub nsw <16 x i32> %2, %5 + %7 = icmp sgt <16 x i32> %6, + %8 = sub nsw <16 x i32> zeroinitializer, %6 + %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8 + %10 = add nsw <16 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %.lcssa = phi <16 x i32> [ %10, %vector.body ] + %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> + %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf + %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2 + %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> + %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3 + %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> + %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4 + %12 = extractelement <16 x i32> %bin.rdx4, i32 0 + ret i32 %12 +}
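
For reference, a minimal usage sketch (illustrative only, not part of the patch above): a function whose widest operations fit in 256 bits can carry the "required-vector-width" attribute, here together with "prefer-vector-width". The X86TargetMachine change keys the cached per-function subtarget on these attribute strings, and on a subtarget like the one in the RUN line above (AVX512VL with 256-bit vectors preferred) the new useAVX512Regs()/useBWIRegs() helpers are expected to return false for this function, so the <16 x i32> math should be split into 256-bit ymm operations much like the add256 test. The function and value names are assumptions chosen for the example.

define void @sketch_add_v16i32(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "prefer-vector-width"="256" "required-vector-width"="256" {
  ; On an AVX512VL target that prefers 256-bit vectors, this add is expected to
  ; be legalized as two <8 x i32> (ymm) additions rather than one zmm addition.
  %x = load <16 x i32>, <16 x i32>* %a
  %y = load <16 x i32>, <16 x i32>* %b
  %s = add <16 x i32> %x, %y
  store <16 x i32> %s, <16 x i32>* %c
  ret void
}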