diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18662,10 +18662,23 @@ // Allow targets to opt-out. EVT VT = Extract->getValueType(0); - // Only handle byte sized scalars otherwise the offset is incorrect. - // FIXME: We might be able to do better if the VT is byte sized and the index - // is aligned. - if (!VT.getScalarType().isByteSized()) + // We can only create byte sized loads. + if (!VT.isByteSized()) + return SDValue(); + + unsigned Index = ExtIdx->getZExtValue(); + unsigned NumElts = VT.getVectorNumElements(); + + // If the index is a multiple of the extract element count, we can offset the + // address by the store size multiplied by the subvector index. Otherwise if + // the scalar type is byte sized, we can just use the index multiplied by + // the element size in bytes as the offset. + unsigned Offset; + if (Index % NumElts == 0) + Offset = (Index / NumElts) * VT.getStoreSize(); + else if (VT.getScalarType().isByteSized()) + Offset = Index * VT.getScalarType().getStoreSize(); + else return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -18675,8 +18688,7 @@ // The narrow load will be offset from the base address of the old load if // we are extracting from something besides index 0 (little-endian). SDLoc DL(Extract); - SDValue BaseAddr = Ld->getOperand(1); - unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); + SDValue BaseAddr = Ld->getBasePtr(); // TODO: Use "BaseIndexOffset" to make this more effective. SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -237,8 +237,7 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) { ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovd (%rdi), %k0 -; AVX512-NEXT: kshiftrd $16, %k0, %k0 +; AVX512-NEXT: kmovb 2(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm2 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 @@ -249,8 +248,7 @@ ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2 @@ -326,8 +324,7 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) { ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovd (%rdi), %k0 -; AVX512-NEXT: kshiftrd $24, %k0, %k0 +; AVX512-NEXT: kmovb 3(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm2 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 @@ -339,8 +336,8 @@ ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1 +; AVX512NOTDQ-NEXT: movzbl 3(%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] @@ -417,8 +414,7 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) { ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $32, %k0, %k0 +; AVX512-NEXT: kmovb 4(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm2 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 @@ -429,8 +425,7 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2 @@ -448,8 +443,7 @@ define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) { ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $32, %k0, %k0 +; AVX512-NEXT: kmovw 4(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %zmm2 ; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512-NEXT: vpmovd2m %zmm2, %k1 @@ -460,8 +454,7 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1 @@ -536,8 +529,7 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) { ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $56, %k0, %k0 +; AVX512-NEXT: kmovb 7(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm2 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 @@ -549,8 +541,8 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1 +; AVX512NOTDQ-NEXT: movzbl 7(%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] @@ -569,8 +561,7 @@ define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) { ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $48, %k0, %k0 +; AVX512-NEXT: kmovw 6(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %zmm2 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2 @@ -582,8 +573,7 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2 @@ -1037,8 +1027,7 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovd (%rdi), %k0 -; AVX512-NEXT: kshiftrd $16, %k0, %k0 +; AVX512-NEXT: kmovb 2(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 @@ -1048,8 +1037,7 @@ ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 @@ -1140,8 +1128,7 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovd (%rdi), %k0 -; AVX512-NEXT: kshiftrd $24, %k0, %k0 +; AVX512-NEXT: kmovb 3(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 @@ -1152,8 +1139,8 @@ ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1 +; AVX512NOTDQ-NEXT: movzbl 3(%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] @@ -1245,8 +1232,7 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $32, %k0, %k0 +; AVX512-NEXT: kmovb 4(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 @@ -1256,8 +1242,7 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 @@ -1274,8 +1259,7 @@ define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $32, %k0, %k0 +; AVX512-NEXT: kmovw 4(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %zmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512-NEXT: vpmovd2m %zmm0, %k0 @@ -1285,8 +1269,7 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1375,8 +1358,7 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $56, %k0, %k0 +; AVX512-NEXT: kmovb 7(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 @@ -1387,8 +1369,8 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1 +; AVX512NOTDQ-NEXT: movzbl 7(%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] @@ -1406,8 +1388,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) { ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: kmovq (%rdi), %k0 -; AVX512-NEXT: kshiftrq $48, %k0, %k0 +; AVX512-NEXT: kmovw 6(%rdi), %k0 ; AVX512-NEXT: vpmovm2d %k0, %zmm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0 @@ -1418,8 +1399,7 @@ ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 -; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1 +; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -278,10 +278,7 @@ ; ; X86-LABEL: shuf_test1: ; X86: ## %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 -; X86-NEXT: kshiftrw $8, %k0, %k0 -; X86-NEXT: kmovd %k0, %eax -; X86-NEXT: ## kill: def $al killed $al killed $eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: retl %v1 = bitcast i16 %v to <16 x i1> %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -7,8 +7,8 @@ define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { ; X86-LABEL: test_int_x86_avx512_kunpck_wd: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: kunpckwd %k1, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -29,10 +29,8 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { ; X86-LABEL: test_int_x86_avx512_kunpck_qd: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf8,0x90,0x44,0x24,0x04] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_kunpck_qd: