diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23541,6 +23541,50 @@
                               InnerShuf->getOperand(1), CombinedMask);
 }
 
+/// Given a vectorized load used in a splat, scalarize the load to only load the
+/// element required for splatting. Returns an empty SDValue when the shuffle is
+/// not a multi-lane splat of a suitable load or its element type is unsuitable.
+static SDValue scalarizeLoadIntoSplat(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  if (!Shuf->isSplat() || count(Shuf->getMask(), Shuf->getSplatIndex()) <= 1)
+    return SDValue();
+
+  EVT VecVT = Shuf->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  unsigned SplatIdx = Shuf->getSplatIndex();
+
+  // Mask indices >= NumElts select from the second operand: inspect that
+  // operand and rebase the index so it is relative to the loaded vector.
+  SDValue SplattedOp = Shuf->getOperand(SplatIdx < NumElts ? 0 : 1);
+  SplatIdx %= NumElts;
+
+  // The element address is derived from VecVT, so only accept plain
+  // (non-extending, unindexed) simple loads whose sole user is this shuffle.
+  auto *Load = dyn_cast<LoadSDNode>(SplattedOp.getNode());
+  if (!Load || !ISD::isNormalLoad(Load) || !Load->isSimple() ||
+      !Load->hasOneUse())
+    return SDValue();
+
+  // Bail out before creating nodes; the offset math needs byte-sized elements.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VecEltVT = VecVT.getVectorElementType();
+  if (!TLI.isTypeLegal(VecEltVT) || !VecEltVT.isByteSized())
+    return SDValue();
+
+  SDValue SplatIdxV = DAG.getConstant(SplatIdx, SDLoc(Shuf), MVT::i32);
+  SDValue NewPtr =
+      TLI.getVectorElementPointer(DAG, Load->getBasePtr(), VecVT, SplatIdxV);
+  unsigned PtrOff = VecEltVT.getFixedSizeInBits() / 8 * SplatIdx;
+  MachinePointerInfo MPI = Load->getPointerInfo().getWithOffset(PtrOff);
+  Align Alignment = commonAlignment(Load->getAlign(), PtrOff);
+  SDValue NewLoad = DAG.getLoad(VecEltVT, SDLoc(Load), Load->getChain(), NewPtr,
+                                MPI, Alignment,
+                                Load->getMemOperand()->getFlags(),
+                                Load->getAAInfo());
+  DAG.makeEquivalentMemoryOrdering(Load, NewLoad);
+  return DAG.getSplatBuildVector(VecVT, SDLoc(Shuf), NewLoad);
+}
+
 /// If the shuffle mask is taking exactly one element from the first vector
 /// operand and passing through all other elements from the second vector
 /// operand, return the index of the
mask element that is choosing an element @@ -23696,6 +23740,9 @@ if (SDValue V = formSplatFromShuffles(SVN, DAG)) return V; + if (SDValue V = scalarizeLoadIntoSplat(SVN, DAG)) + return V; + // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -716,8 +716,7 @@ ; CHECK-LABEL: load_splat_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret %v = load <8 x float>, ptr %p @@ -729,8 +728,7 @@ ; CHECK-LABEL: load_splat_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: ret %v = load <4 x double>, ptr %p @@ -742,8 +740,7 @@ ; CHECK-LABEL: load_splat_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: mov z0.b, b0 +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] ; CHECK-NEXT: st1b { z0.b }, p0, [x8] ; CHECK-NEXT: ret %v = load <32 x i8>, ptr %p @@ -755,8 +752,7 @@ ; CHECK-LABEL: load_splat_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: st1h { z0.h }, p0, [x8] ; CHECK-NEXT: ret %v = load <16 x i16>, ptr %p @@ -768,8 +764,7 @@ ; CHECK-LABEL: load_splat_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: 
st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret %v = load <8 x i32>, ptr %p @@ -781,8 +776,7 @@ ; CHECK-LABEL: load_splat_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: ret %v = load <4 x i64>, ptr %p diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1168,7 +1168,7 @@ ; GFX9-LABEL: shuffle_v4f16_0000: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 @@ -1179,7 +1179,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -1189,7 +1189,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -1127,16 +1127,11 @@ ; ; CHECK-NOVSX-LABEL: testSplati64_1: ; CHECK-NOVSX: # %bb.0: # %entry -; CHECK-NOVSX-NEXT: 
ld r4, 8(r3) -; CHECK-NOVSX-NEXT: std r4, -8(r1) -; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI21_0@toc@ha -; CHECK-NOVSX-NEXT: ld r3, 0(r3) -; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI21_0@toc@l -; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: ld r3, 8(r3) +; CHECK-NOVSX-NEXT: addi r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -8(r1) ; CHECK-NOVSX-NEXT: std r3, -16(r1) -; CHECK-NOVSX-NEXT: addi r3, r1, -16 -; CHECK-NOVSX-NEXT: lvx v3, 0, r3 -; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2 +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 ; CHECK-NOVSX-NEXT: blr ; ; CHECK-P7-LABEL: testSplati64_1: diff --git a/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll --- a/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll @@ -9,12 +9,9 @@ define <4 x i32> @load_splat_shuhffle_lhs(ptr %p) { ; CHECK-LABEL: load_splat_shuhffle_lhs: ; CHECK: .functype load_splat_shuhffle_lhs (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.load 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +; CHECK-NEXT: v128.load64_splat 0 ; CHECK-NEXT: # fallthrough-return %a = load <2 x i64>, ptr %p %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> @@ -26,12 +23,11 @@ define <4 x i32> @load_splat_shuffle_lhs_with_offset(ptr %p) { ; CHECK-LABEL: load_splat_shuffle_lhs_with_offset: ; CHECK: .functype load_splat_shuffle_lhs_with_offset (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.load 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load64_splat 0 ; CHECK-NEXT: # fallthrough-return %a = load <2 x i64>, ptr %p %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x 
i32> @@ -43,12 +39,9 @@ define <4 x i32> @load_splat_shuffle_rhs(ptr %p) { ; CHECK-LABEL: load_splat_shuffle_rhs: ; CHECK: .functype load_splat_shuffle_rhs (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.load 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +; CHECK-NEXT: v128.load64_splat 0 ; CHECK-NEXT: # fallthrough-return %a = load <2 x i64>, ptr %p %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> @@ -60,12 +53,11 @@ define <4 x i32> @load_splat_shuffle_rhs_with_offset(ptr %p) { ; CHECK-LABEL: load_splat_shuffle_rhs_with_offset: ; CHECK: .functype load_splat_shuffle_rhs_with_offset (i32) -> (v128) -; CHECK-NEXT: .local v128 ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.load 0 -; CHECK-NEXT: local.get 1 -; CHECK-NEXT: i8x16.shuffle 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load64_splat 0 ; CHECK-NEXT: # fallthrough-return %a = load <2 x i64>, ptr %p %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -364,18 +364,18 @@ ret <4 x i32> %vecinit6.i } -; FIXME: Pointer adjusted broadcasts +; Pointer adjusted broadcasts define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone ssp { ; X86-LABEL: load_splat_4i32_4i32_1111: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; X86-NEXT: vbroadcastss 4(%eax), %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: load_splat_4i32_4i32_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; X64-NEXT: vbroadcastss 4(%rdi), %xmm0 ; X64-NEXT: retq entry: %ld = load <4 x i32>, ptr %ptr @@ 
-477,7 +477,7 @@ ; ; X64-LABEL: load_splat_2i64_2i64_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; X64-NEXT: retq entry: %ld = load <2 x i64>, ptr %ptr diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1332,23 +1332,24 @@ define <8 x half> @shuffle(ptr %p) { ; CHECK-LIBCALL-LABEL: shuffle: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movdqu (%rdi), %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-LIBCALL-NEXT: pinsrw $0, 8(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: shuffle: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4] -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; BWON-F16C-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0 +; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: shuffle: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movdqu (%eax), %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-I686-NEXT: pinsrw $0, 8(%eax), %xmm0 +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-I686-NEXT: retl %1 = load <8 x half>, ptr %p, align 8 %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -1587,9 +1587,8 @@ ; 
X86-SSE-LABEL: insertps_from_broadcast_loadv4f32: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08] -; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-SSE-NEXT: insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: @@ -1608,9 +1607,8 @@ ; ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f] -; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-SSE-NEXT: insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -1222,7 +1222,7 @@ define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind { ; SSE2-LABEL: insert_dup_mem128_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll --- a/llvm/test/CodeGen/X86/widened-broadcast.ll +++ b/llvm/test/CodeGen/X86/widened-broadcast.ll @@ -468,7 +468,7 @@ define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_4f32_8f32_0000: ; SSE: # %bb.0: # %entry 
-; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ;