Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -194,6 +194,7 @@
     bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
     bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
     bool matchAddress(SDValue N, X86ISelAddressMode &AM);
+    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
     bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
@@ -1502,22 +1503,34 @@
   return false;
 }
 
+/// Helper for selectVectorAddr. Handles things that can be folded into a
+/// gather/scatter address. The index register and scale should have already
+/// been handled.
+bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
+  // TODO: Support other operations.
+  switch (N.getOpcode()) {
+  case X86ISD::Wrapper:
+    if (!matchWrapper(N, AM))
+      return false;
+    break;
+  }
+
+  return matchAddressBase(N, AM);
+}
+
 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                        SDValue &Scale, SDValue &Index,
                                        SDValue &Disp, SDValue &Segment) {
-  unsigned ScalarSize;
+  X86ISelAddressMode AM;
   if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent)) {
-    Base = Mgs->getBasePtr();
-    Index = Mgs->getIndex();
-    ScalarSize = Mgs->getValue().getScalarValueSizeInBits();
+    AM.IndexReg = Mgs->getIndex();
+    AM.Scale = Mgs->getValue().getScalarValueSizeInBits() / 8;
   } else {
     auto X86Gather = cast<X86MaskedGatherSDNode>(Parent);
-    Base = X86Gather->getBasePtr();
-    Index = X86Gather->getIndex();
-    ScalarSize = X86Gather->getValue().getScalarValueSizeInBits();
+    AM.IndexReg = X86Gather->getIndex();
+    AM.Scale = X86Gather->getValue().getScalarValueSizeInBits() / 8;
   }
 
-  X86ISelAddressMode AM;
   unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
   // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
   if (AddrSpace == 256)
@@ -1527,21 +1540,23 @@
   if (AddrSpace == 258)
     AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
 
-  SDLoc DL(N);
-  Scale = getI8Imm(ScalarSize/8, DL);
-
   // If Base is 0, the whole address is in index and the Scale is 1
-  if (isa<ConstantSDNode>(Base)) {
-    assert(cast<ConstantSDNode>(Base)->isNullValue() &&
+  if (isa<ConstantSDNode>(N)) {
+    assert(cast<ConstantSDNode>(N)->isNullValue() &&
            "Unexpected base in gather/scatter");
-    Scale = getI8Imm(1, DL);
-    Base = CurDAG->getRegister(0, MVT::i32);
+    AM.Scale = 1;
   }
-  if (AM.Segment.getNode())
-    Segment = AM.Segment;
-  else
-    Segment = CurDAG->getRegister(0, MVT::i32);
-  Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  // Otherwise, try to match into the base and displacement fields.
+  else if (matchVectorAddress(N, AM))
+    return false;
+
+  MVT VT = N.getSimpleValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base_Reg.getNode())
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  }
+
+  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
   return true;
 }
 
Index: test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- test/CodeGen/X86/masked_gather_scatter.ll
+++ test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
@@ -491,18 +492,34 @@
 ; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
-; SKX-LABEL: test9:
-; SKX:       # BB#0: # %entry
-; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
-; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
-; SKX-NEXT:    retq
+; SKX_SMALL-LABEL: test9:
+; SKX_SMALL:       # BB#0: # %entry
+; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
+; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX_SMALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_SMALL-NEXT:    retq
+;
+; SKX_LARGE-LABEL: test9:
+; SKX_LARGE:       # BB#0: # %entry
+; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
+; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
+; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
+; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_LARGE-NEXT:    retq
 ;
 ; SKX_32-LABEL: test9:
 ; SKX_32:       # BB#0: # %entry
@@ -560,18 +577,34 @@
 ; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
-; SKX-LABEL: test10:
-; SKX:       # BB#0: # %entry
-; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
-; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
-; SKX-NEXT:    retq
+; SKX_SMALL-LABEL: test10:
+; SKX_SMALL:       # BB#0: # %entry
+; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
+; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX_SMALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_SMALL-NEXT:    retq
+;
+; SKX_LARGE-LABEL: test10:
+; SKX_LARGE:       # BB#0: # %entry
+; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
+; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
+; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
+; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_LARGE-NEXT:    retq
 ;
 ; SKX_32-LABEL: test10:
 ; SKX_32:       # BB#0: # %entry
@@ -2330,33 +2363,37 @@
 define <8 x i32> @test_global_array(<8 x i64> %indxs) {
 ; KNL_64-LABEL: test_global_array:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    movl $glob_array, %eax
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; KNL_64-NEXT:    vmovdqa %ymm1, %ymm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test_global_array:
 ; KNL_32:       # BB#0:
-; KNL_32-NEXT:    movl $glob_array, %eax
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; KNL_32-NEXT:    vmovdqa %ymm1, %ymm0
 ; KNL_32-NEXT:    retl
 ;
-; SKX-LABEL: test_global_array:
-; SKX:       # BB#0:
-; SKX-NEXT:    movl $glob_array, %eax
-; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0
-; SKX-NEXT:    retq
+; SKX_SMALL-LABEL: test_global_array:
+; SKX_SMALL:       # BB#0:
+; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; SKX_SMALL-NEXT:    vmovdqa %ymm1, %ymm0
+; SKX_SMALL-NEXT:    retq
+;
+; SKX_LARGE-LABEL: test_global_array:
+; SKX_LARGE:       # BB#0:
+; SKX_LARGE-NEXT:    movabsq $glob_array, %rax
+; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT:    vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
+; SKX_LARGE-NEXT:    vmovdqa %ymm1, %ymm0
+; SKX_LARGE-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_global_array:
 ; SKX_32:       # BB#0:
-; SKX_32-NEXT:    movl $glob_array, %eax
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; SKX_32-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; SKX_32-NEXT:    vmovdqa %ymm1, %ymm0
 ; SKX_32-NEXT:    retl
   %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
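
For illustration only (not part of the patch): a minimal standalone .ll reproducer in the style of test_global_array above. The global and function names are placeholders, and the masked-gather intrinsic is declared with the mangling this test file already uses. Under the default (small) code model the global address now folds into the gather's displacement via matchVectorAddress/matchWrapper; with -code-model=large it is still materialized with movabsq, as the SKX_LARGE checks show.

@glob_array = internal unnamed_addr constant [16 x i32] zeroinitializer, align 16

declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)

define <8 x i32> @gather_from_global(<8 x i64> %indxs) {
; The vector GEP leaves @glob_array (an X86ISD::Wrapper after lowering) as the
; scalar part of the address, which matchVectorAddress can now fold, giving
;   vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; under the small code model.
  %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
  %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %g
}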