Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1163,6 +1163,47 @@
     break;
   }
 
+  case Intrinsic::x86_ssse3_pshuf_b_128:
+  case Intrinsic::x86_avx2_pshuf_b: {
+    // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant.
+    // Operand 0 is the source vector V1, operand 1 is the byte control mask.
+    auto *V = II->getArgOperand(1);
+    auto *VTy = cast<VectorType>(V->getType());
+    unsigned NumElts = VTy->getNumElements();
+    assert((NumElts == 16 || NumElts == 32) &&
+           "Unexpected number of elements in shuffle mask!");
+    // Initialize the resulting shuffle mask to all zeroes.
+    uint32_t Indexes[32] = {0};
+
+    if (auto *Mask = dyn_cast<ConstantDataVector>(V)) {
+      // Each byte in the shuffle control mask forms an index to permute the
+      // corresponding byte in the destination operand.
+      for (unsigned I = 0; I < NumElts; ++I) {
+        int8_t Index = Mask->getElementAsInteger(I);
+        // A control byte with its most significant bit (bit[7]) set writes
+        // zero to the result byte, so select index NumElts: element 0 of the
+        // zero vector on the right-hand side of the resulting shufflevector.
+
+        // Otherwise the index is the least significant 4 bits of the control
+        // byte. High-lane indexes are rebased after this loop.
+        Indexes[I] = (Index < 0) ? NumElts : Index & 0xF;
+      }
+    } else if (!isa<ConstantAggregateZero>(V))
+      break; // Mask is neither constant data nor zeroinitializer.
+
+    // pshufb shuffles within each 128-bit lane: rebase the high-lane indexes
+    // by 16 (a zero index becomes NumElts + 16, still inside the zero vector).
+    for (unsigned I = 16; I < NumElts; ++I)
+      Indexes[I] += 16;
+
+    auto NewC = ConstantDataVector::get(V->getContext(),
+                                        makeArrayRef(Indexes, NumElts));
+    auto V1 = II->getArgOperand(0);
+    auto V2 = Constant::getNullValue(II->getType());
+    auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
+    return ReplaceInstUsesWith(CI, Shuffle);
+  }
+
   case Intrinsic::x86_avx_vpermilvar_ps:
   case Intrinsic::x86_avx_vpermilvar_ps_256:
   case Intrinsic::x86_avx_vpermilvar_pd:
Index: test/Transforms/InstCombine/x86-pshufb.ll
===================================================================
--- test/Transforms/InstCombine/x86-pshufb.ll
+++ test/Transforms/InstCombine/x86-pshufb.ll
@@ -0,0 +1,214 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Verify that instcombine is able to fold identity shuffles.
+
+define <16 x i8> @identity_test(<16 x i8> %InVec) {
+; CHECK-LABEL: @identity_test
+; CHECK: ret <16 x i8> %InVec
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @identity_test_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx2
+; CHECK: ret <32 x i8> %InVec
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+; Verify that instcombine is able to fold byte shuffles with zero masks.
+
+define <16 x i8> @fold_to_zero_vector(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector
+; CHECK: ret <16 x i8> zeroinitializer
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_to_zero_vector_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx2
+; CHECK: ret <32 x i8> zeroinitializer
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
+; with a shuffle mask of all zeroes.
+ +define <16 x i8> @splat_test(<16 x i8> %InVec) { +; CHECK-LABEL: @splat_test +; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> zeroinitializer + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> zeroinitializer) + ret <16 x i8> %1 +} + +; In the test case below, elements in the low 128-bit lane of the result +; vector are equal to the lower byte of %InVec (shuffle index 0). +; Elements in the high 128-bit lane of the result vector are equal to +; the lower byte in the high 128-bit lane of %InVec (shuffle index 16). + +define <32 x i8> @splat_test_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @splat_test_avx2 +; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> zeroinitializer) + ret <32 x i8> %1 +} + +; Each of the byte shuffles in the following tests is equivalent to a blend between +; vector %InVec and a vector of all zeroes. + +define <16 x i8> @blend1(<16 x i8> %InVec) { +; CHECK-LABEL: @blend1 +; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @blend2(<16 x i8> %InVec) { +; CHECK-LABEL: @blend2 +; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @blend3(<16 x i8> %InVec) { +; CHECK-LABEL: @blend3 +; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @blend4(<16 x i8> %InVec) { +; CHECK-LABEL: @blend4 +; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @blend5(<16 x i8> 
%InVec) { +; CHECK-LABEL: @blend5 +; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @blend6(<16 x i8> %InVec) { +; CHECK-LABEL: @blend6 +; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <32 x i8> @blend1_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @blend1_avx2 +; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @blend2_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @blend2_avx2 +; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @blend3_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @blend3_avx2 +; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @blend4_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @blend4_avx2 +; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @blend5_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @blend5_avx2 +; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @blend6_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @blend6_avx2 +; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +; movq idiom. 
+define <16 x i8> @movq_idiom(<16 x i8> %InVec) { +; CHECK-LABEL: @movq_idiom +; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> , <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <32 x i8> @movq_idiom_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @movq_idiom_avx2 +; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> , <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +; Vector permutations using byte shuffles. + +define <16 x i8> @permute1(<16 x i8> %InVec) { +; CHECK-LABEL: @permute1 +; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <16 x i8> @permute2(<16 x i8> %InVec) { +; CHECK-LABEL: @permute2 +; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> + + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> ) + ret <16 x i8> %1 +} + +define <32 x i8> @permute1_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @permute1_avx2 +; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +define <32 x i8> @permute2_avx2(<32 x i8> %InVec) { +; CHECK-LABEL: @permute2_avx2 +; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> + + %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> ) + ret <32 x i8> %1 +} + +declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)