Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3362,6 +3362,33 @@
 defm VMIN : avx512_binop_s_sae  <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>;
 defm VMAX : avx512_binop_s_sae  <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>;
 
+// Select the legacy SSE/SSE2 scalar FP intrinsics when AVX-512 is enabled.
+multiclass avx512_sse_scalar_intrin_patterns<string OpcodeStr, string IntStr> {
+  let Predicates = [HasAVX512] in { // e.g. OpcodeStr = "ADD", IntStr = "add"
+    def : Pat<(!cast<Intrinsic>("int_x86_sse_"#IntStr#"_ss") VR128X:$src1,
+                                                             VR128X:$src2),
+              (COPY_TO_REGCLASS // scalar result is viewed as VR128X again
+                (!cast<Instruction>("V"#OpcodeStr#"SSZrr") // e.g. VADDSSZrr
+                  (COPY_TO_REGCLASS VR128X:$src1, FR32), // NOTE(review): should this be FR32X? confirm
+                  (COPY_TO_REGCLASS VR128X:$src2, FR32)),
+               VR128X)>; // NOTE(review): if Zrr is commutable, upper lanes may not track $src1 -- confirm
+    def : Pat<(!cast<Intrinsic>("int_x86_sse2_"#IntStr#"_sd") VR128X:$src1,
+                                                              VR128X:$src2),
+              (COPY_TO_REGCLASS // same shape as the _ss pattern, via FR64
+                (!cast<Instruction>("V"#OpcodeStr#"SDZrr") // e.g. VADDSDZrr
+                  (COPY_TO_REGCLASS VR128X:$src1, FR64), // NOTE(review): should this be FR64X? confirm
+                  (COPY_TO_REGCLASS VR128X:$src2, FR64)),
+               VR128X)>; // NOTE(review): same upper-lane question as the _ss pattern -- confirm
+  }
+}
+
+defm : avx512_sse_scalar_intrin_patterns<"ADD", "add">; // additive ops
+defm : avx512_sse_scalar_intrin_patterns<"SUB", "sub">;
+defm : avx512_sse_scalar_intrin_patterns<"MUL", "mul">; // multiplicative ops
+defm : avx512_sse_scalar_intrin_patterns<"DIV", "div">;
+defm : avx512_sse_scalar_intrin_patterns<"MAX", "max">; // max/min
+defm : avx512_sse_scalar_intrin_patterns<"MIN", "min">;
+
 multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86VectorVTInfo _, bit IsCommutable> {
   defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -1650,18 +1650,26 @@
     { X86::PEXT64rr,          X86::PEXT64rm,            0 },
 
     // AVX-512 foldable instructions
-    { X86::VADDPSZrr,         X86::VADDPSZrm,           0 },
     { X86::VADDPDZrr,         X86::VADDPDZrm,           0 },
-    { X86::VSUBPSZrr,         X86::VSUBPSZrm,           0 },
-    { X86::VSUBPDZrr,         X86::VSUBPDZrm,           0 },
-    { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
-    { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
-    { X86::VDIVPSZrr,         X86::VDIVPSZrm,           0 },
+    { X86::VADDPSZrr,         X86::VADDPSZrm,           0 },
+    { X86::VADDSDZrr,         X86::VADDSDZrm,           0 },
+    { X86::VADDSSZrr,         X86::VADDSSZrm,           0 },
     { X86::VDIVPDZrr,         X86::VDIVPDZrm,           0 },
-    { X86::VMINPSZrr,         X86::VMINPSZrm,           0 },
-    { X86::VMINPDZrr,         X86::VMINPDZrm,           0 },
-    { X86::VMAXPSZrr,         X86::VMAXPSZrm,           0 },
+    { X86::VDIVPSZrr,         X86::VDIVPSZrm,           0 },
+    { X86::VDIVSDZrr,         X86::VDIVSDZrm,           0 },
+    { X86::VDIVSSZrr,         X86::VDIVSSZrm,           0 },
     { X86::VMAXPDZrr,         X86::VMAXPDZrm,           0 },
+    { X86::VMAXPSZrr,         X86::VMAXPSZrm,           0 },
+    { X86::VMAXSDZrr,         X86::VMAXSDZrm,           0 },
+    { X86::VMAXSSZrr,         X86::VMAXSSZrm,           0 },
+    { X86::VMINPDZrr,         X86::VMINPDZrm,           0 },
+    { X86::VMINPSZrr,         X86::VMINPSZrm,           0 },
+    { X86::VMINSDZrr,         X86::VMINSDZrm,           0 },
+    { X86::VMINSSZrr,         X86::VMINSSZrm,           0 },
+    { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
+    { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
+    { X86::VMULSDZrr,         X86::VMULSDZrm,           0 },
+    { X86::VMULSSZrr,         X86::VMULSSZrm,           0 },
     { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
     { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
     { X86::VPERMPDZri,        X86::VPERMPDZmi,          0 },
@@ -1682,6 +1690,10 @@
     { X86::VPSRLVQZrr,        X86::VPSRLVQZrm,          0 },
     { X86::VPSUBDZrr,         X86::VPSUBDZrm,           0 },
     { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
+    { X86::VSUBPDZrr,         X86::VSUBPDZrm,           0 },
+    { X86::VSUBPSZrr,         X86::VSUBPSZrm,           0 },
+    { X86::VSUBSDZrr,         X86::VSUBSDZrm,           0 },
+    { X86::VSUBSSZrr,         X86::VSUBSSZrm,           0 },
     { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
     { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
     { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
Index: test/CodeGen/X86/avx512-sse-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-sse-intrinsics.ll
+++ test/CodeGen/X86/avx512-sse-intrinsics.ll
@@ -0,0 +1,341 @@
+; RUN: llc < %s -asm-verbose=true -mtriple=x86_64-unknown-unknown -mattr=+avx512f --show-mc-encoding | FileCheck %s
+
+;====== VMAX ==================================================================;
+
+; CHECK-LABEL: test_maxss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5f,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %max = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; reg-reg, EVEX form
+  ret <4 x float> %max
+}
+
+; CHECK-LABEL: test_maxss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5f,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_maxss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %max = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %max
+}
+
+; CHECK-LABEL: test_maxsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5f,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %max = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, EVEX form
+  ret <2 x double> %max
+}
+
+; CHECK-LABEL: test_maxsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5f,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %max = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %max
+}
+
+; CHECK-LABEL: test_maxps:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
+  %max = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; reg-reg, VEX form
+  ret <4 x float> %max
+}
+
+; CHECK-LABEL: test_maxps_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_maxps_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %max = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %max
+}
+
+; CHECK-LABEL: test_maxpd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5f,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %max = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, VEX form
+  ret <2 x double> %max
+}
+
+; CHECK-LABEL: test_maxpd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5f,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_maxpd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %max = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %max
+}
+
+declare <4 x float>  @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>)
+declare <4 x float>  @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
+
+;====== VMIN ==================================================================;
+
+; CHECK-LABEL: test_minss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %min = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; reg-reg, EVEX form
+  ret <4 x float> %min
+}
+
+; CHECK-LABEL: test_minss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5d,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_minss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %min = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %min
+}
+
+; CHECK-LABEL: test_minsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %min = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, EVEX form
+  ret <2 x double> %min
+}
+
+; CHECK-LABEL: test_minsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5d,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_minsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %min = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %min
+}
+
+; CHECK-LABEL: test_minps:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1) #0 {
+  %min = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; reg-reg, VEX form
+  ret <4 x float> %min
+}
+
+; CHECK-LABEL: test_minps_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_minps_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %min = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %min
+}
+
+; CHECK-LABEL: test_minpd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5d,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %min = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, VEX form
+  ret <2 x double> %min
+}
+
+; CHECK-LABEL: test_minpd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vminpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x5d,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_minpd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %min = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %min
+}
+
+declare <4 x float>  @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
+declare <4 x float>  @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>)
+
+;====== VADD ==================================================================;
+
+; CHECK-LABEL: test_addss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x58,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_addss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %sum = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; reg-reg, EVEX form
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: test_addss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x58,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_addss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %sum = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: test_addsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x58,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_addsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %sum = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, EVEX form
+  ret <2 x double> %sum
+}
+
+; CHECK-LABEL: test_addsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x58,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_addsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %sum = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %sum
+}
+
+declare <4 x float>  @llvm.x86.sse.add.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>)
+
+;====== VSUB ==================================================================;
+
+; CHECK-LABEL: test_subss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5c,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_subss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %diff = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; reg-reg, EVEX form
+  ret <4 x float> %diff
+}
+
+; CHECK-LABEL: test_subss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5c,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_subss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %diff = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %diff
+}
+
+; CHECK-LABEL: test_subsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5c,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_subsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %diff = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, EVEX form
+  ret <2 x double> %diff
+}
+
+; CHECK-LABEL: test_subsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5c,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_subsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %diff = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %diff
+}
+
+declare <4 x float>  @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>)
+
+;====== VMUL ==================================================================;
+
+; CHECK-LABEL: test_mulss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x59,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_mulss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %prod = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; reg-reg, EVEX form
+  ret <4 x float> %prod
+}
+
+; CHECK-LABEL: test_mulss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x59,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_mulss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %prod = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %prod
+}
+
+; CHECK-LABEL: test_mulsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x59,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_mulsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %prod = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, EVEX form
+  ret <2 x double> %prod
+}
+
+; CHECK-LABEL: test_mulsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x59,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_mulsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %prod = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %prod
+}
+
+declare <4 x float>  @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>)
+
+;====== VDIV ==================================================================;
+
+; CHECK-LABEL: test_divss:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5e,0xc1]
+; CHECK-NEXT: retq
+define <4 x float> @test_divss(<4 x float> %a0, <4 x float> %a1) #0 {
+  %quot = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; reg-reg, EVEX form
+  ret <4 x float> %quot
+}
+
+; CHECK-LABEL: test_divss_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x5e,0x07]
+; CHECK-NEXT: retq
+define <4 x float> @test_divss_rm(<4 x float> %a0, <4 x float>* %a1) #0 {
+  %rhs = load <4 x float>, <4 x float>* %a1 ; expected to fold into the op
+  %quot = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %rhs)
+  ret <4 x float> %quot
+}
+
+; CHECK-LABEL: test_divsd:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5e,0xc1]
+; CHECK-NEXT: retq
+define <2 x double> @test_divsd(<2 x double> %a0, <2 x double> %a1) #0 {
+  %quot = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; reg-reg, EVEX form
+  ret <2 x double> %quot
+}
+
+; CHECK-LABEL: test_divsd_rm:
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x5e,0x07]
+; CHECK-NEXT: retq
+define <2 x double> @test_divsd_rm(<2 x double> %a0, <2 x double>* %a1) #0 {
+  %rhs = load <2 x double>, <2 x double>* %a1 ; expected to fold into the op
+  %quot = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %rhs)
+  ret <2 x double> %quot
+}
+
+declare <4 x float>  @llvm.x86.sse.div.ss(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>)
+
+attributes #0 = { nounwind }