Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3362,6 +3362,33 @@
 defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>;
 defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>;
 
+// Also match the pre-AVX512 intrinsics.
+multiclass avx512_sse_scalar_intrin_patterns<string OpcodeStr, string IntStr> {
+  let Predicates = [HasAVX512] in {
+    def : Pat<(!cast<Intrinsic>("int_x86_sse_"#IntStr#"_ss") VR128X:$src1,
+                                                             VR128X:$src2),
+              (COPY_TO_REGCLASS
+                (!cast<Instruction>("V"#OpcodeStr#"SSZrr")
+                  (COPY_TO_REGCLASS VR128X:$src1, FR32),
+                  (COPY_TO_REGCLASS VR128X:$src2, FR32)),
+                VR128X)>;
+    def : Pat<(!cast<Intrinsic>("int_x86_sse2_"#IntStr#"_sd") VR128X:$src1,
+                                                              VR128X:$src2),
+              (COPY_TO_REGCLASS
+                (!cast<Instruction>("V"#OpcodeStr#"SDZrr")
+                  (COPY_TO_REGCLASS VR128X:$src1, FR64),
+                  (COPY_TO_REGCLASS VR128X:$src2, FR64)),
+                VR128X)>;
+  }
+}
+
+defm : avx512_sse_scalar_intrin_patterns<"ADD", "add">;
+defm : avx512_sse_scalar_intrin_patterns<"MUL", "mul">;
+defm : avx512_sse_scalar_intrin_patterns<"SUB", "sub">;
+defm : avx512_sse_scalar_intrin_patterns<"DIV", "div">;
+defm : avx512_sse_scalar_intrin_patterns<"MIN", "min">;
+defm : avx512_sse_scalar_intrin_patterns<"MAX", "max">;
+
 multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86VectorVTInfo _, bit IsCommutable> {
   defm rr: AVX512_maskable
NOTE(review): text was lost in extraction at this point and is NOT reconstructed here: the template arguments that follow "AVX512_maskable" on the context line above, the diff header of the new .ll test file (Index / --- / +++ / @@ lines), its RUN line, the VMAX banner, and the CHECK lines for test_maxss. Restore them from the original review before applying this patch.
+define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_maxss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5f,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_maxss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_maxsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5f,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_maxsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5f,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_maxps:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_maxps_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_maxps_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_maxpd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5f,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_maxpd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5f,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxpd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
+
+;====== VMIN ==================================================================;
+
+; CHECK-LABEL: test_minss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_minss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5d,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_minss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_minsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_minsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5d,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_minsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_minps:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_minps_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_minps_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_minpd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_minpd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5d,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_minpd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>)
+
+;====== VADD ==================================================================;
+
+; CHECK-LABEL: test_addss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x58,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_addss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_addss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x58,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_addss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_addsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x58,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_addsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_addsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x58,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_addsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>)
+
+;====== VSUB ==================================================================;
+
+; CHECK-LABEL: test_subss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5c,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_subss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_subss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5c,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_subss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_subsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5c,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_subsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_subsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5c,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_subsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>)
+
+;====== VMUL ==================================================================;
+
+; CHECK-LABEL: test_mulss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x59,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_mulss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_mulss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x59,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_mulss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_mulsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x59,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_mulsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_mulsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x59,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_mulsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>)
+
+;====== VDIV ==================================================================;
+
+; CHECK-LABEL: test_divss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5e,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_divss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_divss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5e,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_divss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %a1l = load <4 x float>, <4 x float>* %a1
+  %res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1l)
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: test_divsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5e,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_divsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: test_divsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5e,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_divsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %a1l = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1l)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>)
+
+attributes #0 = { nounwind }