Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -27834,11 +27834,24 @@
   MVT EltVT = VT.getVectorElementType();
   SDLoc DL(Op.getNode());
 
-  switch (Op.getOpcode()) {
+  auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
+                                      SDValue Op2) {
+    Op0 = DAG.getBitcast(VT, Op0);
+    DCI.AddToWorklist(Op0.getNode());
+    Op1 = DAG.getBitcast(VT, Op1);
+    DCI.AddToWorklist(Op1.getNode());
+    DCI.CombineTo(OrigOp.getNode(),
+                  DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
+    return true;
+  };
+
+  unsigned Opcode = Op.getOpcode();
+  switch (Opcode) {
   case X86ISD::PALIGNR:
     // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
     if (!VT.is128BitVector())
       return false;
+    Opcode = X86ISD::VALIGN;
     LLVM_FALLTHROUGH;
   case X86ISD::VALIGN: {
     if (EltVT != MVT::i32 && EltVT != MVT::i64)
@@ -27851,14 +27864,17 @@
     if ((ShiftAmt % EltSize) != 0)
       return false;
     Imm = ShiftAmt / EltSize;
-    SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
-    DCI.AddToWorklist(Op0.getNode());
-    SDValue Op1 = DAG.getBitcast(VT, Op.getOperand(1));
-    DCI.AddToWorklist(Op1.getNode());
-    DCI.CombineTo(OrigOp.getNode(),
-                  DAG.getNode(X86ISD::VALIGN, DL, VT, Op0, Op1,
-                              DAG.getConstant(Imm, DL, MVT::i8)));
-    return true;
+    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
+                                    DAG.getConstant(Imm, DL, MVT::i8));
+  }
+  case X86ISD::SHUF128: {
+    if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
+      return false;
+    // Only change element size, not type.
+    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
+      return false;
+    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
+                                    Op.getOperand(2));
   }
   }
 
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -431,16 +431,16 @@
 ; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
 ; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
 ; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
-; AVX512F-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
-; AVX512F-NEXT:    vblendmps %zmm0, %zmm2, %zmm0 {%k1}
+; AVX512F-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_vshuff32x4_512_mask:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vpsllw $7, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
-; AVX512BW-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
-; AVX512BW-NEXT:    vblendmps %zmm0, %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
   %x2 = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
   %res = select <16 x i1> %mask, <16 x float> %x2, <16 x float> %y
@@ -453,16 +453,16 @@
 ; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
 ; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
 ; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
-; AVX512F-NEXT:    vpblendmd %zmm0, %zmm2, %zmm0 {%k1}
+; AVX512F-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_vshufi32x4_512_mask:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vpsllw $7, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
-; AVX512BW-NEXT:    vpblendmd %zmm0, %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
   %x2 = shufflevector <16 x i32> %x, <16 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
   %res = select <16 x i1> %mask, <16 x i32> %x2, <16 x i32> %y
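
The effect is visible in the test diffs above: a 128-bit-lane shuffle that was lowered as a 64-bit-element SHUF128 can now be rewritten in the mask's 32-bit element type, so the select folds into a single masked vshuff32x4/vshufi32x4 instead of an unmasked vshuff64x2/vshufi64x2 followed by a blend. The immediate operand (Op.getOperand(2)) is reused unchanged because it selects 128-bit blocks regardless of element width. A minimal standalone sketch in the spirit of the updated tests; the RUN line, triple, and function name here are illustrative and not part of the patch:

; Illustrative reproducer: with this patch, the shuffle+select below should
; compile to one masked vshuff32x4 rather than vshuff64x2 plus vblendmps.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s
; CHECK: vshuff32x4 {{.*}} {%k1}
define <16 x float> @masked_shuf128_example(<16 x float> %x, <16 x float> %x1, <16 x float> %y, <16 x i1> %mask) {
  ; Elements 0-7 from %x, then 128-bit lanes 1 and 0 of %x1 (indices 20-23, 16-19).
  %x2 = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
  %res = select <16 x i1> %mask, <16 x float> %x2, <16 x float> %y
  ret <16 x float> %res
}

Before the patch, the same input yields the vshuff64x2 followed by vblendmps seen in the deleted CHECK lines.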