diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9689,6 +9689,9 @@
   return SDValue();
 }
 
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG);
+
 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
 /// just apply the bit to the vectors.
@@ -9696,6 +9699,7 @@
 /// from this, but enough scalar bit operations are created from the later
 /// legalization + scalarization stages to need basic support.
 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+                                       const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
   SDLoc DL(Op);
   MVT VT = Op->getSimpleValueType(0);
@@ -9759,7 +9763,14 @@
 
   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
-  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+  SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
+
+  if (!IsShift)
+    return Res;
+
+  // Immediately lower the shift to ensure the constant build vector doesn't
+  // get converted to a constant pool before the shift is lowered.
+  return LowerShift(Res, Subtarget, DAG);
 }
 
 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
@@ -10115,7 +10126,7 @@
     return HorizontalOp;
   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
     return Broadcast;
-  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
+  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
     return BitOp;
 
   unsigned EVTBits = EltVT.getSizeInBits();
diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr46527.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 -relocation-model=pic | FileCheck %s
+
+define void @f(<16 x i8>* %out, <16 x i8> %in, i1 %flag) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    calll .L0$pb
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:  .L0$pb:
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    notb %dl
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    movzbl %dl, %edx
+; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT:    paddb %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm1
+; CHECK-NEXT:    pxor {{\.LCPI.*}}@GOTOFF(%eax), %xmm1
+; CHECK-NEXT:    movdqa %xmm1, (%ecx)
+; CHECK-NEXT:    retl
+entry:
+  %0 = select i1 %flag, i8 0, i8 2
+  %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %3 = xor <16 x i8> %2, %in
+  %4 = xor <16 x i8> %3, 
+  store <16 x i8> %4, <16 x i8>* %out, align 16
+  ret void
+}
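
For context, the pattern lowerBuildVectorToBitOp matches is a BUILD_VECTOR whose lanes all apply the same bit operation (here a shift) with one constant operand. Below is a minimal IR sketch of that shape; it is illustrative only and not taken from the patch (the function name and values are hypothetical), and per the doc comment above, in practice these scalar ops usually appear during DAG legalization/scalarization rather than in source IR.

; Every lane computes `shl x, 1`, so the combine can rebuild this as one
; vector shift: shl (build_vector %a, %b, %c, %d), (splat 1). Before this
; patch, the splat-constant shift amount could be folded into a constant-pool
; load before LowerShift ran; lowering the shift immediately keeps the
; constant visible as a build vector.
define <4 x i32> @scalarized_shl(i32 %a, i32 %b, i32 %c, i32 %d) {
  %sa = shl i32 %a, 1
  %sb = shl i32 %b, 1
  %sc = shl i32 %c, 1
  %sd = shl i32 %d, 1
  %v0 = insertelement <4 x i32> undef, i32 %sa, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %sb, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %sc, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %sd, i32 3
  ret <4 x i32> %v3
}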