[DAGCombiner] Fold a lane-0 splat of a bitcast scalar_to_vector/build_vector.

On little-endian targets, a splat shuffle of lane 0 whose input is a BITCAST
of a single-use SCALAR_TO_VECTOR or BUILD_VECTOR is rebuilt as a splat
BUILD_VECTOR of that node's first operand (zero-extended or truncated to the
type returned by TLI.getTypeToTransformTo for the result element type). The
fold only fires when both vector types are integer and the result element is
no wider than the source element.

Test expectations are updated for AArch64, Thumb2 (MVE), WebAssembly and X86.

NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line
breaks follow the hunk headers; indentation and blank context lines were
reconstructed by convention. Confirm against the tree, or apply with
`git apply --ignore-whitespace`.

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23021,6 +23021,23 @@
       if (auto *Idx = dyn_cast<ConstantSDNode>(N0.getOperand(2)))
         if (Idx->getAPIntValue() == SplatIndex)
           return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(1));
+
+    // Look through a bitcast if LE and splatting lane 0, through to a
+    // scalar_to_vector or a build_vector.
+    if (N0.getOpcode() == ISD::BITCAST && N0.getOperand(0).hasOneUse() &&
+        SplatIndex == 0 && DAG.getDataLayout().isLittleEndian() &&
+        (N0.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR ||
+         N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR)) {
+      EVT N00VT = N0.getOperand(0).getValueType();
+      if (VT.getScalarSizeInBits() <= N00VT.getScalarSizeInBits() &&
+          VT.isInteger() && N00VT.isInteger()) {
+        EVT InVT =
+            TLI.getTypeToTransformTo(*DAG.getContext(), VT.getScalarType());
+        SDValue Op = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0),
+                                        SDLoc(N), InVT);
+        return DAG.getSplatBuildVector(VT, SDLoc(N), Op);
+      }
+    }
   }
 
   // If this is a bit convert that changes the element type of the vector but
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -508,8 +508,7 @@
 define <8 x i16> @bitcast_i64_v8i16(i64 %a) {
 ; CHECK-LABEL: bitcast_i64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    dup.8h v0, v0[0]
+; CHECK-NEXT:    dup.8h v0, w0
 ; CHECK-NEXT:    ret
   %b = bitcast i64 %a to <4 x i16>
   %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -2538,12 +2538,11 @@
 ; CHECK-LABEL: cmplx_mul_combined_re_im:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    lsr x8, x0, #16
+; CHECK-NEXT:    adrp x9, .LCPI196_0
 ; CHECK-NEXT:    fmov d4, x0
 ; CHECK-NEXT:    rev32 v5.8h, v0.8h
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    adrp x8, .LCPI196_0
-; CHECK-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI196_0]
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI196_0]
 ; CHECK-NEXT:    sqneg v2.8h, v1.8h
 ; CHECK-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v3.16b
 ; CHECK-NEXT:    sqdmull v2.4s, v0.4h, v4.h[0]
diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -399,8 +399,6 @@
 define arm_aapcs_vfpcc <8 x i16> @bitcast_i128_v8i16(i128 %a) {
 ; CHECK-LE-LABEL: bitcast_i128_v8i16:
 ; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.32 q0[0], r0
-; CHECK-LE-NEXT:    vmov.u16 r0, q0[0]
 ; CHECK-LE-NEXT:    vdup.16 q0, r0
 ; CHECK-LE-NEXT:    bx lr
 ;
@@ -549,8 +547,6 @@
 define arm_aapcs_vfpcc <8 x i16> @other_max_case(i32 %blockSize) {
 ; CHECK-LE-LABEL: other_max_case:
 ; CHECK-LE:       @ %bb.0:
-; CHECK-LE-NEXT:    vmov.32 q0[0], r0
-; CHECK-LE-NEXT:    vmov.u16 r0, q0[0]
 ; CHECK-LE-NEXT:    vdup.16 q0, r0
 ; CHECK-LE-NEXT:    bx lr
 ;
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll
@@ -1,8 +1,7 @@
 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
 
 ; Test that a splat shuffle of an fp-to-int bitcasted vector correctly
-; optimizes and lowers to a single splat instruction. Without a custom
-; DAG combine, this ends up doing both a splat and a shuffle.
+; optimizes and lowers to a single splat instruction.
 
 target triple = "wasm32-unknown-unknown"
 
@@ -19,8 +18,8 @@
 
 ; CHECK-LABEL: not_a_vec:
 ; CHECK-NEXT: .functype not_a_vec (i64, i64) -> (v128){{$}}
-; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $0{{$}}
-; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $pop[[L1]], $2, 0, 1, 2, 3
+; CHECK-NEXT: i32.wrap_i64 $push[[L:[0-9]+]]=, $0
+; CHECK-NEXT: i32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]
 ; CHECK-NEXT: return $pop[[R]]
 define <4 x i32> @not_a_vec(i128 %x) {
   %a = bitcast i128 %x to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -74,8 +74,8 @@
 ; X64-LABEL: test2:
 ; X64:       ## %bb.0: ## %entry
 ; X64-NEXT:    movq _tmp_V2i@GOTPCREL(%rip), %rax
-; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-NEXT:    movq %xmm0, (%rax)
 ; X64-NEXT:    retq
 entry: