Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37006,6 +37006,22 @@ SrcElts.setBit(M); } + // If OpInputs[Src] is not an operand of Op.getNode() but has other users, + // assume that all of its elements are needed, i.e. SrcElts.setAllBits(). + // For example: + // t1317: v8i32 = insert_subvector undef:v8i32, t1414, Constant:i64<0> + // t1315: v8i32 = X86ISD::BLENDI t380, t1317, TargetConstant:i8<2> + // t1414: v4i32 = insert_vector_elt t679, t677, Constant:i64<2> + // t1416: v8i32 = X86ISD::VBROADCAST t1414 + // When getTargetShuffleInputs(...) processes t1416, it creates + // NewNode: v8i32 = insert_subvector undef:v8i32, t1414, Constant:i64<0> + // which is identical to t1317. + // As a result, getTargetShuffleInputs(...) sets + // OpInputs[0] = t1317, which is also used by t1315. + // Before SimplifyDemandedVectorElts processes OpInputs[0], which is also + // used by t1315, assume that all of its elements are needed. + if (!OpInputs[Src].isOperandOf(Op.getNode()) && !OpInputs[Src].use_empty()) + SrcElts.setAllBits(); // TODO - Propagate input undef/zero elts. 
APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, Index: llvm/test/CodeGen/X86/simplifydemandedvectorselts-broadcast.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/simplifydemandedvectorselts-broadcast.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s + +; Function Attrs: noinline nounwind optnone uwtable +define <16 x i32> @main(<3 x i32>* %ptr) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7] +; CHECK-NEXT: vpbroadcastd %xmm1, %ymm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7] +; CHECK-NEXT: retq +entry: + %int3 = load <3 x i32>, <3 x i32>* %ptr, align 1 + %0 = shufflevector <3 x i32> %int3, <3 x i32> undef, <16 x i32> + %1 = shufflevector <16 x i32> , <16 x i32> %0, <16 x i32> + ret <16 x i32 > %1 +}