Index: lib/Target/X86/X86InterleavedAccess.cpp
===================================================================
--- lib/Target/X86/X86InterleavedAccess.cpp
+++ lib/Target/X86/X86InterleavedAccess.cpp
@@ -72,6 +72,8 @@
   void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                              SmallVectorImpl<Value *> &TransposedMatrix,
                              unsigned NumSubVecElems);
+  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
+                                SmallVectorImpl<Value *> &TransposedMatrix);
   void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix,
                                unsigned NumSubVecElems);
@@ -127,7 +129,7 @@
     return true;
 
   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
-      (WideInstSize == 512 || WideInstSize == 1024))
+      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024))
     return true;
 
   if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
@@ -218,6 +220,52 @@
                           VT.getVectorNumElements() / 2);
 }
 
+// Interleave four 8 x i8 vectors (stride 4, VF 8) into two 16 x i8 vectors.
+// The inputs are first zipped pairwise at byte granularity, and the two
+// results are then combined with the low/high word masks to produce the
+// element order shown in the comments below.
+void X86InterleavedAccessGroup::interleave8bitStride4VF8(
+    ArrayRef<Instruction *> Matrix,
+    SmallVectorImpl<Value *> &TransposedMatrix) {
+  // Assuming we start from the following vectors:
+  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
+  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
+  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
+  // Matrix[3]= k0 k1 k2 k3 k4 ... k7
+
+  // The unpack masks are requested for v8i16 and then scaled by 2.
+  // NOTE(review): both helpers are declared outside this patch - confirm.
+  MVT HalfVT = MVT::v8i16;
+  TransposedMatrix.resize(2);
+  SmallVector<uint32_t, 16> MaskLow;
+  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
+  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
+
+  for (unsigned i = 0; i < 8; ++i) {
+    MaskLow.push_back(i);
+    MaskLow.push_back(i + 8);
+  }
+
+  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp1, true, false);
+  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp1, false, false);
+  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
+  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
+  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
+  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
+  Value *IntrVec1Low =
+      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
+  Value *IntrVec2Low =
+      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
+
+  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
+  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
+
+  TransposedMatrix[0] =
+      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
+  TransposedMatrix[1] =
+      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
+}
+
 void X86InterleavedAccessGroup::interleave8bitStride4(
     ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
     unsigned numberOfElement) {
@@ -538,6 +586,9 @@
     case 4:
       transpose_4x4(DecomposedVectors, TransposedVectors);
       break;
+    case 8:
+      interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
+      break;
     case 16:
     case 32:
       interleave8bitStride4(DecomposedVectors, TransposedVectors, NumSubVecElems);
Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -937,20 +937,14 @@ ; AVX1-LABEL: interleaved_store_vf8_i8_stride4: ; AVX1: # BB#0: ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: 
vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,5,13,6,14,7,15,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,1,9,2,10,3,11,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -958,20 +952,14 @@ ; AVX-LABEL: interleaved_store_vf8_i8_stride4: ; AVX: # BB#0: ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,5,13,6,14,7,15,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; 
AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,1,9,2,10,3,11,u,u,u,u,u,u,u,u> ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX-NEXT: vmovdqa %ymm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq Index: test/Transforms/InterleavedAccess/X86/interleavedStore.ll =================================================================== --- test/Transforms/InterleavedAccess/X86/interleavedStore.ll +++ test/Transforms/InterleavedAccess/X86/interleavedStore.ll @@ -67,8 +67,16 @@ ; CHECK-LABEL: @interleaved_store_vf8_i8_stride4( ; CHECK-NEXT: [[V1:%.*]] = shufflevector <8 x i8> [[X1:%.*]], <8 x i8> [[X2:%.*]], <16 x i32> ; CHECK-NEXT: [[V2:%.*]] = shufflevector <8 x i8> [[X2]], <8 x i8> [[X3:%.*]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <32 x i32> -; CHECK-NEXT: store <32 x i8> [[INTERLEAVED_VEC]], <32 x i8>* [[P:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1]], <16 x 
i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <32 x i32> +; CHECK-NEXT: store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]] ; CHECK-NEXT: ret void ; %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32>