Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -9383,6 +9383,30 @@
                      DAG.getConstant(PermMask, DL, MVT::i8));
 }
 
+/// \brief Handle lowering 4-lane 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> WidenedMask,
+                                        SelectionDAG &DAG) {
+
+  assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!");
+  // Form a 128-bit permutation: convert the 64-bit shuffle mask selection
+  // values into the 128-bit lane selection bits defined by a vshuf64x2
+  // instruction's immediate control byte.
+  unsigned PermMask = 0, Imm = 0;
+
+  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+    if (WidenedMask[i] == SM_SentinelZero)
+      return SDValue();
+
+    // Use the first element in place of an undef mask element.
+    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+    PermMask |= (Imm % 4) << (i * 2);
+  }
+
+  return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
 /// shuffling each lane.
 ///
@@ -10176,6 +10200,10 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    if (SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG))
+      return Op;
   // X86 has dedicated unpack instructions that can handle specific blend
   // operations: UNPCKH and UNPCKL.
   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
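
For reference, the control byte built by lowerV4X128VectorShuffle packs one 2-bit source-lane selector per 128-bit result lane, low lane in the low bits. The standalone C++ sketch below mirrors that loop outside SelectionDAG; computeShuf128Imm and the Sentinel enum are illustrative names, not LLVM APIs. It reproduces the immediates checked in the tests that follow:

#include <cstdio>

// Sentinels standing in for LLVM's SM_SentinelUndef / SM_SentinelZero.
enum { SentinelUndef = -1, SentinelZero = -2 };

// Same rules as the lowering above: a zeroable lane makes the lowering bail
// out, an undef lane reuses lane 0, and each selector keeps only 2 bits.
static int computeShuf128Imm(const int (&WidenedMask)[4]) {
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    if (WidenedMask[i] == SentinelZero)
      return -1; // analogous to returning the empty SDValue()
    unsigned Imm = (WidenedMask[i] == SentinelUndef) ? 0 : WidenedMask[i];
    PermMask |= (Imm % 4) << (i * 2);
  }
  return (int)PermMask;
}

int main() {
  // Widened 128-bit-lane masks of the shufflevector tests below.
  int A[4] = {0, 2, 0, 2}; // test_vshuff64x2_512(_mask) -> $136
  int B[4] = {0, 2, 2, 2}; // test_vshufi64x2_512_mask   -> $168
  int C[4] = {0, 2, 2, 0}; // test_vshuff64x2_512_mem    -> $40
  int D[4] = {0, 1, 1, 0}; // test_vshuff32x4_512_mem    -> $20
  printf("%d %d %d %d\n", computeShuf128Imm(A), computeShuf128Imm(B),
         computeShuf128Imm(C), computeShuf128Imm(D));
  return 0;
}

Compiled and run, this prints "136 168 40 20", matching the immediates in the CHECK lines below.
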
Index: llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
@@ -116,10 +116,10 @@
   ret <16 x i32> %b
 }
 
 ; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
+; CHECK: valignq $3, %zmm0, %zmm1
 ; CHECK: ret
 define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
   ret <8 x double> %c
 }
@@ -252,6 +252,62 @@
   ret <8 x double> %c
 }
 
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $136, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm2, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
+  %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_vshufi64x2_512_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm2, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 4, i32 5>
+  %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
+  ret <8 x i64> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double>* %ptr) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $40, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %x1 = load <8 x double>, <8 x double>* %ptr, align 1
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 0, i32 1>
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float>* %ptr) nounwind {
+; CHECK-LABEL: test_vshuff32x4_512_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $20, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %x1 = load <16 x float>, <16 x float>* %ptr, align 1
+  %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x float> %res
+}
+
 define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; CHECK-LABEL: test_align_v16i32_rr:
 ; CHECK:       ## BB#0:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -88,7 +88,7 @@
 define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermpd $68, %zmm0, %zmm0
+; ALL-NEXT:    vshuff64x2 $160, %zmm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x double> %shuffle
@@ -650,7 +650,7 @@
 define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermq $68, %zmm0, %zmm0
+; ALL-NEXT:    vshufi64x2 $160, %zmm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x i64> %shuffle
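
The test16 update above follows from the new lowering path rather than from any valignq change: the old mask <2,3,4,5,6,7,8,9> pairs cleanly into 128-bit lanes, so canWidenShuffleElements would now accept it and route the shuffle to the vshuff64x2 path instead of valignq. Shifting every index up by one breaks the pairing, so the test keeps exercising valignq (now with immediate 3). The same rule explains the vector-shuffle-512-v8.ll changes: mask 01014545 widens to lanes [0,0,2,2], giving the $160 immediate. Below is a simplified sketch of the pairing rule; canWiden is an illustration and omits the undef/zero sentinel handling of LLVM's real canWidenShuffleElements:

#include <cstdio>
#include <vector>

// A mask widens when every aligned pair (2*k, 2*k+1) of entries selects the
// two halves of one wider element, i.e. an even index followed by index + 1.
static bool canWiden(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.clear();
  for (int i = 0, e = (int)Mask.size(); i < e; i += 2) {
    if (Mask[i] % 2 != 0 || Mask[i + 1] != Mask[i] + 1)
      return false;
    Widened.push_back(Mask[i] / 2);
  }
  return true;
}

int main() {
  std::vector<int> Old = {2, 3, 4, 5, 6, 7, 8, 9};  // old test16 mask
  std::vector<int> New = {3, 4, 5, 6, 7, 8, 9, 10}; // new test16 mask
  std::vector<int> W;
  printf("old widens: %d\n", canWiden(Old, W)); // 1 -> would hit SHUF128 path
  printf("new widens: %d\n", canWiden(New, W)); // 0 -> still lowers to valignq
  return 0;
}
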