diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22834,6 +22834,46 @@
                               InnerShuf->getOperand(1), CombinedMask);
 }
 
+/// Given a vector load that is only used by a splat shuffle, scalarize the
+/// load so that just the element required for the splat is loaded.
+static SDValue scalarizeLoadIntoSplat(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  if (!Shuf->isSplat())
+    return SDValue();
+
+  EVT VecVT = Shuf->getOperand(0).getValueType();
+  unsigned NumElts = VecVT.getVectorNumElements();
+  // Splat indices of NumElts or more select from the second shuffle operand.
+  unsigned SplatEltIdx = Shuf->getSplatIndex();
+  SDValue SplattedOp = SplatEltIdx < NumElts ? Shuf->getOperand(0)
+                                             : Shuf->getOperand(1);
+  SplatEltIdx %= NumElts;
+
+  auto *Load = dyn_cast<LoadSDNode>(SplattedOp.getNode());
+  if (!Load)
+    return SDValue();
+
+  if (!(Load->isSimple() && Load->hasOneUse() && VecVT.isVector()))
+    return SDValue();
+
+  auto &TLI = DAG.getTargetLoweringInfo();
+  SDValue SplatIdx = DAG.getConstant(SplatEltIdx, SDLoc(Shuf), MVT::i32);
+  SDValue NewPtr =
+      TLI.getVectorElementPointer(DAG, Load->getBasePtr(), VecVT, SplatIdx);
+
+  EVT VecEltVT = VecVT.getVectorElementType();
+  unsigned PtrOff = VecEltVT.getSizeInBits() * SplatEltIdx / 8;
+  MachinePointerInfo MPI = Load->getPointerInfo().getWithOffset(PtrOff);
+  Align Alignment = commonAlignment(Load->getAlign(), PtrOff);
+
+  auto NewLoad = DAG.getLoad(VecEltVT, SDLoc(Load), Load->getChain(), NewPtr,
+                             MPI, Alignment, Load->getMemOperand()->getFlags(),
+                             Load->getAAInfo());
+  DAG.makeEquivalentMemoryOrdering(Load, NewLoad);
+
+  return DAG.getSplatBuildVector(Shuf->getValueType(0), SDLoc(Shuf), NewLoad);
+}
+
 /// If the shuffle mask is taking exactly one element from the first vector
 /// operand and passing through all other elements from the second vector
 /// operand, return the index of the mask element that is choosing an element
@@ -22989,6 +23029,9 @@
   if (SDValue V = formSplatFromShuffles(SVN, DAG))
     return V;
 
+  if (SDValue V = scalarizeLoadIntoSplat(SVN, DAG))
+    return V;
+
   // If it is a splat, check if the argument vector is another splat or a
   // build_vector.
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -716,8 +716,7 @@
 ; CHECK-LABEL: load_splat_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %p
@@ -729,8 +728,7 @@
 ; CHECK-LABEL: load_splat_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %p
@@ -742,8 +740,7 @@
 ; CHECK-LABEL: load_splat_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.b, b0
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <32 x i8>, ptr %p
@@ -755,8 +752,7 @@
 ; CHECK-LABEL: load_splat_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <16 x i16>, ptr %p
@@ -768,8 +764,7 @@
 ; CHECK-LABEL: load_splat_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <8 x i32>, ptr %p
@@ -781,8 +776,7 @@
 ; CHECK-LABEL: load_splat_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %v = load <4 x i64>, ptr %p
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Ensures that vectorized loads that are really just splatted loads are indeed
+; selected as splatted loads.
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @load_splat_shuffle_lhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs:
+; CHECK:         .functype load_splat_shuffle_lhs (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_lhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs_with_offset:
+; CHECK:         .functype load_splat_shuffle_lhs_with_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs:
+; CHECK:         .functype load_splat_shuffle_rhs (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs_with_offset:
+; CHECK:         .functype load_splat_shuffle_rhs_with_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_splat 0
+; CHECK-NEXT:    # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %d
+}
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -84,7 +84,7 @@
 ; X32-SSE2-LABEL: t4:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movdqa (%eax), %xmm0
+; X32-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE2-NEXT:    movd %xmm0, %eax
 ; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X32-SSE2-NEXT:    movd %xmm0, %edx
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -395,14 +395,14 @@
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t17:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X64-NEXT:    retq
 entry:
   %tmp1 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1587,9 +1587,8 @@
 ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1608,9 +1607,8 @@
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1222,7 +1222,7 @@
 define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind {
 ; SSE2-LABEL: insert_dup_mem128_v2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps (%rdi), %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -468,7 +468,7 @@
 define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_4f32_8f32_0000:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
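
Note on the transformation (illustration only, not part of the patch): the new combine rewrites a splat shuffle whose source is a one-use, simple vector load into a scalar load of just the splatted element followed by a splat, adjusting the pointer info and alignment for the element offset. A rough IR-level sketch of the before/after, assuming a <4 x i32> load splatting element 1; the function names, types, and element index are made up for the example, and the combine itself runs on the SelectionDAG rather than on IR:

; Before: the whole vector is loaded only to splat a single element.
define <4 x i32> @splat_elt1(ptr %p) {
  %v = load <4 x i32>, ptr %p
  %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %s
}

; After (conceptually): only element 1 is loaded, then splatted.
define <4 x i32> @splat_elt1_scalarized(ptr %p) {
  %q = getelementptr inbounds i32, ptr %p, i32 1
  %e = load i32, ptr %q
  %ins = insertelement <4 x i32> poison, i32 %e, i64 0
  %s = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}

The scalar form is what lets the backends select their single-element broadcast loads, as the updated tests show (ld1r* for SVE, v128.load64_splat for WebAssembly SIMD, and narrower scalar loads feeding shuffles on x86).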