Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6175,6 +6175,51 @@
 defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,
                                     SSE_INTALU_ITINS_SHUFF_P>;
 
+let Predicates = [HasAVX2] in {
+  // Common patterns involving scalar load.
+  def : Pat<(int_x86_avx2_pmovsxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXBWYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovsxbw (vzload_v2i64 addr:$src)),
+            (VPMOVSXBWYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVSXBWYrm addr:$src)>;
+
+  def : Pat<(int_x86_avx2_pmovsxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXWDYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovsxwd (vzload_v2i64 addr:$src)),
+            (VPMOVSXWDYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVSXWDYrm addr:$src)>;
+
+  def : Pat<(int_x86_avx2_pmovsxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXDQYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovsxdq (vzload_v2i64 addr:$src)),
+            (VPMOVSXDQYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVSXDQYrm addr:$src)>;
+
+  def : Pat<(int_x86_avx2_pmovzxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXBWYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovzxbw (vzload_v2i64 addr:$src)),
+            (VPMOVZXBWYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVZXBWYrm addr:$src)>;
+
+  def : Pat<(int_x86_avx2_pmovzxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXWDYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovzxwd (vzload_v2i64 addr:$src)),
+            (VPMOVZXWDYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVZXWDYrm addr:$src)>;
+
+  def : Pat<(int_x86_avx2_pmovzxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXDQYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovzxdq (vzload_v2i64 addr:$src)),
+            (VPMOVZXDQYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVZXDQYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
@@ -6327,6 +6372,19 @@
 defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
                                     SSE_INTALU_ITINS_SHUFF_P>;
 
+let Predicates = [HasAVX2] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_avx2_pmovsxbd (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXBDYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovsxwq (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXWQYrm addr:$src)>;
+
+  def : Pat<(int_x86_avx2_pmovzxbd (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXBDYrm addr:$src)>;
+  def : Pat<(int_x86_avx2_pmovzxwq (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXWQYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
@@ -6447,6 +6505,15 @@
             (VPMOVSXBQYrm addr:$src)>;
 }
 
+let Predicates = [HasAVX2] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_avx2_pmovsxbq (vzmovl_v4i32 addr:$src)),
+            (VPMOVSXBQYrm addr:$src)>;
+
+  def : Pat<(int_x86_avx2_pmovzxbq (vzmovl_v4i32 addr:$src)),
+            (VPMOVZXBQYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbq
Index: test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
@@ -0,0 +1,159 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2 | FileCheck %s
+
+; CHECK-LABEL: test_x86_avx2_pmovsxbd
+; CHECK: vpmovsxbd (%rdi), %ymm0
+define <8 x i32> @test_x86_avx2_pmovsxbd(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <16 x i8>
+  %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %4)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovsxbq
+; CHECK: vpmovsxbq (%rdi), %ymm0
+define <4 x i64> @test_x86_avx2_pmovsxbq(i32* %a0) {
+  %1 = load i32* %a0, align 1
+  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
+  %3 = insertelement <4 x i32> %2, i32 0, i32 1
+  %4 = insertelement <4 x i32> %3, i32 0, i32 2
+  %5 = insertelement <4 x i32> %4, i32 0, i32 3
+  %6 = bitcast <4 x i32> %5 to <16 x i8>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %6)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovsxbw
+; CHECK: vpmovsxbw (%rdi), %ymm0
+define <16 x i16> @test_x86_avx2_pmovsxbw(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <16 x i8>
+  %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %4)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovsxdq
+; CHECK: vpmovsxdq (%rdi), %ymm0
+define <4 x i64> @test_x86_avx2_pmovsxdq(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <4 x i32>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %4)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovsxwd
+; CHECK: vpmovsxwd (%rdi), %ymm0
+define <8 x i32> @test_x86_avx2_pmovsxwd(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %4)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovsxwq
+; CHECK: vpmovsxwq (%rdi), %ymm0
+define <4 x i64> @test_x86_avx2_pmovsxwq(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %4)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
+
+; CHECK-LABEL: test_x86_avx2_pmovzxbd
+; CHECK: vpmovzxbd (%rdi), %ymm0
+define <8 x i32> @test_x86_avx2_pmovzxbd(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <16 x i8>
+  %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %4)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovzxbq
+; CHECK: vpmovzxbq (%rdi), %ymm0
+define <4 x i64> @test_x86_avx2_pmovzxbq(i32* %a0) {
+  %1 = load i32* %a0, align 1
+  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
+  %3 = insertelement <4 x i32> %2, i32 0, i32 1
+  %4 = insertelement <4 x i32> %3, i32 0, i32 2
+  %5 = insertelement <4 x i32> %4, i32 0, i32 3
+  %6 = bitcast <4 x i32> %5 to <16 x i8>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %6)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovzxbw
+; CHECK: vpmovzxbw (%rdi), %ymm0
+define <16 x i16> @test_x86_avx2_pmovzxbw(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <16 x i8>
+  %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %4)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovzxdq
+; CHECK: vpmovzxdq (%rdi), %ymm0
+define <4 x i64> @test_x86_avx2_pmovzxdq(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <4 x i32>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %4)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovzxwd
+; CHECK: vpmovzxwd (%rdi), %ymm0
+define <8 x i32> @test_x86_avx2_pmovzxwd(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %4)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+; CHECK-LABEL: test_x86_avx2_pmovzxwq
+; CHECK: vpmovzxwq (%rdi), %ymm0
+define <4 x i64> @test_x86_avx2_pmovzxwq(i64* %a0) {
+  %1 = load i64* %a0, align 1
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %4)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
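
The new test file only exercises the vzmovl form (scalar load plus zero insertelements). As a reference, not part of the patch, here is a minimal IR sketch of the full 128-bit load form that the added (bc_v16i8 (loadv2i64 addr:$src)) pattern is meant to fold; the function name is hypothetical and the expected assembly is an assumption based on the pattern, written in the same era syntax as the test above:

; Illustrative sketch only, not part of the patch.
; Assumed codegen with the new pattern: vpmovsxbw (%rdi), %ymm0
define <16 x i16> @sketch_pmovsxbw_vec_load(<2 x i64>* %a0) {
  %1 = load <2 x i64>* %a0, align 16
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %2)
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone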