diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23459,6 +23459,47 @@
                        InnerShuf->getOperand(1), CombinedMask);
 }
 
+/// Given a vector load used by a splat shuffle, scalarize the load so that
+/// only the element required for the splat is loaded.
+static SDValue scalarizeLoadIntoSplat(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  if (!Shuf->isSplat())
+    return SDValue();
+
+  EVT VecVT = Shuf->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  unsigned SplatIdx = Shuf->getSplatIndex();
+  SDValue SplattedOp;
+  if (SplatIdx < NumElts) {
+    SplattedOp = Shuf->getOperand(0);
+  } else {
+    SplattedOp = Shuf->getOperand(1);
+    SplatIdx -= NumElts;
+  }
+
+  auto *Load = dyn_cast<LoadSDNode>(SplattedOp.getNode());
+  if (!Load || !Load->isSimple() || !Load->hasOneUse())
+    return SDValue();
+
+  auto &TLI = DAG.getTargetLoweringInfo();
+  SDValue SplatIdxV = DAG.getConstant(SplatIdx, SDLoc(Shuf), MVT::i32);
+  SDValue NewPtr =
+      TLI.getVectorElementPointer(DAG, Load->getBasePtr(), VecVT, SplatIdxV);
+
+  EVT VecEltVT = VecVT.getVectorElementType();
+  unsigned PtrOff = VecEltVT.getSizeInBits() * SplatIdx / 8;
+  MachinePointerInfo MPI = Load->getPointerInfo().getWithOffset(PtrOff);
+  Align Alignment = commonAlignment(Load->getAlign(), PtrOff);
+
+  SDValue NewLoad =
+      DAG.getLoad(VecEltVT, SDLoc(Load), Load->getChain(), NewPtr, MPI,
+                  Alignment, Load->getMemOperand()->getFlags(),
+                  Load->getAAInfo());
+  DAG.makeEquivalentMemoryOrdering(Load, NewLoad);
+
+  return DAG.getSplatBuildVector(VecVT, SDLoc(Shuf), NewLoad);
+}
+
 /// If the shuffle mask is taking exactly one element from the first vector
 /// operand and passing through all other elements from the second vector
 /// operand, return the index of the mask element that is choosing an element
@@ -23614,6 +23655,9 @@
   if (SDValue V = formSplatFromShuffles(SVN, DAG))
     return V;
 
+  if (SDValue V = scalarizeLoadIntoSplat(SVN, DAG))
+    return V;
+
   // If it is a splat, check if the argument vector is another splat or a
   // build_vector.
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
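To make the intent of scalarizeLoadIntoSplat concrete, here is a hand-written
IR-level sketch of the rewrite (illustrative only: the combine operates on
SelectionDAG nodes, and the function names and lane choice below are invented
for the example, not taken from the patch):

  ; Before: the whole <4 x i32> is loaded, then lane 1 is splatted.
  define <4 x i32> @splat_lane1(ptr %p) {
    %v = load <4 x i32>, ptr %p
    %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    ret <4 x i32> %s
  }

  ; After (conceptually): only the 4-byte element at offset 4 is loaded, and
  ; the splat is rebuilt from the scalar.
  define <4 x i32> @splat_lane1_scalarized(ptr %p) {
    %ep = getelementptr inbounds i32, ptr %p, i64 1
    %e = load i32, ptr %ep, align 4
    %h = insertelement <4 x i32> poison, i32 %e, i64 0
    %s = shufflevector <4 x i32> %h, <4 x i32> poison, <4 x i32> zeroinitializer
    ret <4 x i32> %s
  }

The getWithOffset/commonAlignment/makeEquivalentMemoryOrdering calls in the
combine are what keep the narrowed load's aliasing info, alignment, and chain
consistent with the wide load it replaces.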
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -716,8 +716,7 @@
 ; CHECK-LABEL: load_splat_v8f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <8 x float>, ptr %p
@@ -729,8 +728,7 @@
 ; CHECK-LABEL: load_splat_v4f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <4 x double>, ptr %p
@@ -742,8 +740,7 @@
 ; CHECK-LABEL: load_splat_v32i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT: mov z0.b, b0
+; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
 ; CHECK-NEXT: st1b { z0.b }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <32 x i8>, ptr %p
@@ -755,8 +752,7 @@
 ; CHECK-LABEL: load_splat_v16i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT: mov z0.h, h0
+; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: st1h { z0.h }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <16 x i16>, ptr %p
@@ -768,8 +764,7 @@
 ; CHECK-LABEL: load_splat_v8i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <8 x i32>, ptr %p
@@ -781,8 +776,7 @@
 ; CHECK-LABEL: load_splat_v4i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT: ret
 %v = load <4 x i64>, ptr %p
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -135,24 +135,28 @@
 ; GFX9-LABEL: shuffle_v4f16_u3uu:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_u3uu:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_u3uu:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x half>, ptr addrspace(1) %arg0
 %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1168,7 +1172,7 @@
 ; GFX9-LABEL: shuffle_v4f16_0000:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
@@ -1179,7 +1183,7 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
@@ -1189,7 +1193,7 @@
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1902,31 +1906,31 @@
 ; GFX9-LABEL: hi16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: global_load_dword v5, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: hi16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: hi16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -1978,30 +1982,31 @@
 ; GFX9-LABEL: hi16low16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: hi16low16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: hi16low16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -2091,30 +2096,31 @@
 ; GFX9-LABEL: i16_hi16low16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_hi16low16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:2
 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_hi16low16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
@@ -2128,31 +2134,31 @@
 ; GFX9-LABEL: i16_hi16bits:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: global_load_dword v5, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_hi16bits:
 ; GFX10: ; %bb.0: ; %entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_short_d16 v2, v[0:1], off offset:2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_hi16bits:
 ; GFX11: ; %bb.0: ; %entry
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
--- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
+++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
@@ -8,9 +8,10 @@
 define i32 @foo(ptr %descs, i32 %num, i32 %cw) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr d16, [r0, #32]
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16[]}, [r0:32]
 ; CHECK-NEXT: vadd.i32 d16, d16, d16
-; CHECK-NEXT: vmov.32 r0, d16[0]
+; CHECK-NEXT: vmov.32 r0, d16[1]
 ; CHECK-NEXT: bx lr
 entry:
 %wide.vec = load <16 x i32>, ptr %descs, align 4
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -1127,16 +1127,11 @@
 ;
 ; CHECK-NOVSX-LABEL: testSplati64_1:
 ; CHECK-NOVSX: # %bb.0: # %entry
-; CHECK-NOVSX-NEXT: ld r4, 8(r3)
-; CHECK-NOVSX-NEXT: std r4, -8(r1)
-; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI21_0@toc@ha
-; CHECK-NOVSX-NEXT: ld r3, 0(r3)
-; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI21_0@toc@l
-; CHECK-NOVSX-NEXT: lvx v2, 0, r4
+; CHECK-NOVSX-NEXT: ld r3, 8(r3)
+; CHECK-NOVSX-NEXT: addi r4, r1, -16
+; CHECK-NOVSX-NEXT: std r3, -8(r1)
 ; CHECK-NOVSX-NEXT: std r3, -16(r1)
-; CHECK-NOVSX-NEXT: addi r3, r1, -16
-; CHECK-NOVSX-NEXT: lvx v3, 0, r3
-; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2
+; CHECK-NOVSX-NEXT: lvx v2, 0, r4
 ; CHECK-NOVSX-NEXT: blr
 ;
 ; CHECK-P7-LABEL: testSplati64_1:
@@ -1145,11 +1140,26 @@
 ; CHECK-P7-NEXT: lxvdsx v2, 0, r3
 ; CHECK-P7-NEXT: blr
 ;
-; P8-AIX-LABEL: testSplati64_1:
-; P8-AIX: # %bb.0: # %entry
-; P8-AIX-NEXT: addi r3, r3, 8
-; P8-AIX-NEXT: lxvdsx v2, 0, r3
-; P8-AIX-NEXT: blr
+; P8-AIX-64-LABEL: testSplati64_1:
+; P8-AIX-64: # %bb.0: # %entry
+; P8-AIX-64-NEXT: addi r3, r3, 8
+; P8-AIX-64-NEXT: lxvdsx v2, 0, r3
+; P8-AIX-64-NEXT: blr
+;
+; P8-AIX-32-LABEL: testSplati64_1:
+; P8-AIX-32: # %bb.0: # %entry
+; P8-AIX-32-NEXT: lwz r4, L..C4(r2) # %const.0
+; P8-AIX-32-NEXT: lwz r5, 12(r3)
+; P8-AIX-32-NEXT: lwz r3, 8(r3)
+; P8-AIX-32-NEXT: stw r5, -16(r1)
+; P8-AIX-32-NEXT: stw r3, -32(r1)
+; P8-AIX-32-NEXT: addi r3, r1, -16
+; P8-AIX-32-NEXT: lxvw4x v2, 0, r4
+; P8-AIX-32-NEXT: addi r4, r1, -32
+; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
+; P8-AIX-32-NEXT: lxvw4x v4, 0, r4
+; P8-AIX-32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX-32-NEXT: blr
 entry:
 %0 = load <2 x i64>, ptr %ptr, align 8
 %1 = shufflevector <2 x i64> %0, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Ensures that vectorized loads that are really just splatted loads are
+; selected as splat loads.
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @load_splat_shuffle_lhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs:
+; CHECK: .functype load_splat_shuffle_lhs (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_lhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_lhs_with_offset:
+; CHECK: .functype load_splat_shuffle_lhs_with_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 8
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs:
+; CHECK: .functype load_splat_shuffle_rhs (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> <i32 2, i32 2>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @load_splat_shuffle_rhs_with_offset(ptr %p) {
+; CHECK-LABEL: load_splat_shuffle_rhs_with_offset:
+; CHECK: .functype load_splat_shuffle_rhs_with_offset (i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 8
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: v128.load64_splat 0
+; CHECK-NEXT: # fallthrough-return
+  %a = load <2 x i64>, ptr %p
+  %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> <i32 3, i32 3>
+  %c = bitcast <2 x i64> %b to <4 x i32>
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x i32> %d
+}
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -364,18 +364,18 @@
   ret <4 x i32> %vecinit6.i
 }
 
-; FIXME: Pointer adjusted broadcasts
+; Pointer adjusted broadcasts
 
 define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone ssp {
 ; X86-LABEL: load_splat_4i32_4i32_1111:
 ; X86: ## %bb.0: ## %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X86-NEXT: vbroadcastss 4(%eax), %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: load_splat_4i32_4i32_1111:
 ; X64: ## %bb.0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
 ; X64-NEXT: retq
 entry:
 %ld = load <4 x i32>, ptr %ptr
@@ -472,12 +472,12 @@
 ; X86-LABEL: load_splat_2i64_2i64_1111:
 ; X86: ## %bb.0: ## %entry
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: load_splat_2i64_2i64_1111:
 ; X64: ## %bb.0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT: retq
 entry:
 %ld = load <2 x i64>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -84,7 +84,7 @@
 ; X32-SSE2-LABEL: t4:
 ; X32-SSE2: # %bb.0:
 ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movdqa (%eax), %xmm0
+; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE2-NEXT: movd %xmm0, %eax
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X32-SSE2-NEXT: movd %xmm0, %edx
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1332,23 +1332,24 @@
 define <8 x half> @shuffle(ptr %p) {
 ; CHECK-LIBCALL-LABEL: shuffle:
 ; CHECK-LIBCALL: # %bb.0:
-; CHECK-LIBCALL-NEXT: movdqu (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-LIBCALL-NEXT: pinsrw $0, 8(%rdi), %xmm0
+; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; CHECK-LIBCALL-NEXT: retq
 ;
 ; BWON-F16C-LABEL: shuffle:
 ; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4]
-; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; BWON-F16C-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0
+; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; BWON-F16C-NEXT: retq
 ;
 ; CHECK-I686-LABEL: shuffle:
 ; CHECK-I686: # %bb.0:
 ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT: movdqu (%eax), %xmm0
-; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-I686-NEXT: pinsrw $0, 8(%eax), %xmm0
+; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; CHECK-I686-NEXT: retl
 %1 = load <8 x half>, ptr %p, align 8
 %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -395,14 +395,14 @@
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t17:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X64-NEXT: retq
 entry:
 %tmp1 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1587,9 +1587,8 @@
 ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X86-SSE: ## %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
-; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT: insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT: retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1608,9 +1607,8 @@
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
-; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT: insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT: retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1222,7 +1222,7 @@
 define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind {
 ; SSE2-LABEL: insert_dup_mem128_v2f64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -468,7 +468,7 @@
 define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_4f32_8f32_0000:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT: retq
 ;
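A consequence of the Load->isSimple() and Load->hasOneUse() guards in
scalarizeLoadIntoSplat worth calling out: a splat of a load that has other
users (or of a volatile/atomic load) is deliberately left alone, since the
wide load has to stay and scalarizing would add a second memory access rather
than remove one. A hand-written example (not a test from this patch) where
the combine must not fire:

  ; %v has a second user (the store), so the full-width load must remain;
  ; the splat keeps using it instead of a narrowed element load.
  define <4 x i32> @no_scalarize_multi_use(ptr %p, ptr %q) {
    %v = load <4 x i32>, ptr %p
    store <4 x i32> %v, ptr %q
    %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
    ret <4 x i32> %s
  }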