Index: llvm/trunk/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td
+++ llvm/trunk/include/llvm/IR/Intrinsics.td
@@ -261,6 +261,7 @@
 def llvm_v4f32_ty      : LLVMType<v4f32>;    //  4 x float
 def llvm_v8f32_ty      : LLVMType<v8f32>;    //  8 x float
 def llvm_v16f32_ty     : LLVMType<v16f32>;   // 16 x float
+def llvm_v32f32_ty     : LLVMType<v32f32>;   // 32 x float
 def llvm_v1f64_ty      : LLVMType<v1f64>;    //  1 x double
 def llvm_v2f64_ty      : LLVMType<v2f64>;    //  2 x double
 def llvm_v4f64_ty      : LLVMType<v4f64>;    //  4 x double

Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1663,8 +1663,8 @@
 def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn;

 // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
-def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem]>;
 def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty],
@@ -1683,8 +1683,8 @@
   [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem]>;
-def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem]>;
 def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty],
@@ -1723,8 +1723,8 @@
   [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem]>;
-def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem]>;
 def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty],

Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -165,6 +165,9 @@
   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

+  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
+
   setOperationAction(ISD::LOAD, MVT::i64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
@@ -256,6 +259,9 @@
   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

+  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
+
   setOperationAction(ISD::STORE, MVT::i64, Promote);
   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
@@ -355,7 +361,10 @@
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);

   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
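
For reference, a minimal IR sketch, not part of the patch (the kernel name is made up), of what the new v32f32 LOAD/STORE promotion rules legalize: the v32f32 access is promoted to v32i32 and reuses the existing 1024-bit memory lowering.

define amdgpu_kernel void @copy_v32f32(<32 x float> addrspace(1)* %in, <32 x float> addrspace(1)* %out) {
bb:
  ; promoted to a v32i32 load/store by the rules added above
  %v = load <32 x float>, <32 x float> addrspace(1)* %in
  store <32 x float> %v, <32 x float> addrspace(1)* %out
  ret void
}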
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -153,6 +153,7 @@
   if (Subtarget->hasMAIInsts()) {
     addRegisterClass(MVT::v32i32, &AMDGPU::AReg_1024RegClass);
+    addRegisterClass(MVT::v32f32, &AMDGPU::AReg_1024RegClass);
   }

   computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -263,8 +264,9 @@
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-        MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+                  MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:

Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2178,14 +2178,13 @@
 def VOP_V4F32_F32_F32_V4F32 : VOPProfile <[v4f32, f32, f32, v4f32]>;
 def VOP_V16F32_F32_F32_V16F32 : VOPProfile <[v16f32, f32, f32, v16f32]>;
-// TODO: define v32f32
-def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32i32, f32, f32, v32i32]>;
+def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32f32, f32, f32, v32f32]>;
 def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>;
 def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
-def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32i32, v4f16, v4f16, v32i32]>;
+def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
 def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>;
 def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
-def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32i32, v2i16, v2i16, v32i32]>;
+def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
 def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>;
 def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
 def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;

Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -942,6 +942,14 @@
   def Insert_Element_v32i32_#Index : Insert_Element <
     i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
   >;
+
+  def Extract_Element_v32f32_#Index : Extract_Element <
+    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Insert_Element_v32f32_#Index : Insert_Element <
+    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
 }

 // FIXME: Why do only some of these type combinations for SReg and
@@ -1034,6 +1042,10 @@
 def : BitConvert <v16i32, v16f32, VReg_512>;
 def : BitConvert <v16f32, v16i32, VReg_512>;

+// 1024-bit bitcast
+def : BitConvert <v32i32, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v32i32, VReg_1024>;
+
 /********** =================== **********/
 /********** Src & Dst modifiers **********/
 /********** =================== **********/
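
A sketch of IR covered by the selection patterns above (illustrative only; the kernel is not from the patch): the bitcast maps to the new 1024-bit BitConvert pair, and the extractelement to one of the Extract_Element_v32f32_* patterns.

define amdgpu_kernel void @v32f32_bitcast_extract(<32 x i32> addrspace(1)* %p, float addrspace(1)* %out) {
bb:
  %iv = load <32 x i32>, <32 x i32> addrspace(1)* %p
  ; free at register level: v32i32 and v32f32 share the 1024-bit classes
  %fv = bitcast <32 x i32> %iv to <32 x float>
  ; selected via Extract_Element_v32f32_7 (sub7)
  %elt = extractelement <32 x float> %fv, i32 7
  store float %elt, float addrspace(1)* %out
  ret void
}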
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -757,11 +757,11 @@
   let isAllocatable = 0;
 }

-def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add SGPR_1024Regs)> {
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add SGPR_1024Regs)> {
   let AllocationPriority = 19;
 }

-def SReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32,
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
   (add SGPR_1024)> {
   let CopyCost = 16;
   let AllocationPriority = 19;
@@ -812,7 +812,7 @@
   let AllocationPriority = 7;
 }

-def VReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add VGPR_1024)> {
+def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add VGPR_1024)> {
   let Size = 1024;
   let CopyCost = 32;
   let AllocationPriority = 8;
@@ -840,7 +840,7 @@
 }

 // TODO: add v32f32 value type
-def AReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add AGPR_1024)> {
+def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add AGPR_1024)> {
   let Size = 1024;
   let CopyCost = 65;
   let AllocationPriority = 8;

Index: llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -1,15 +1,15 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

 ; GCN-LABEL: {{^}}test_32_agprs:
 ; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0
 ; GCN-NOT: v28
 ; GCN: NumVgprs: 32
 ; GCN: VGPRBlocks: 7
-define amdgpu_kernel void @test_32_agprs(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_32_agprs(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
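
With the retyped intrinsic, the accumulator operand is a <32 x float> constant or value. A self-contained usage sketch (hypothetical kernel, not from the patch; a zero accumulator is chosen for brevity):

declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

define amdgpu_kernel void @mfma_zero_acc(<32 x float> addrspace(1)* %arg) {
bb:
  ; accumulator is now typed <32 x float>; zeroinitializer selects the inline constant 0
  %mai = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  store <32 x float> %mai, <32 x float> addrspace(1)* %arg
  ret void
}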
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1,11 +1,11 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
@@ -15,7 +15,7 @@
 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
@@ -100,11 +100,11 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }

@@ -326,14 +326,14 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x i32> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
+define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
   %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
   %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
   %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
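
The next hunk retypes the bf16 variant. Only the accumulator and result change type; the A and B operands stay <2 x i16>, carrying raw bfloat bits, since IR had no dedicated bfloat type at this point. An illustrative sketch (not from the patch; kernel name and constants are made up) packing two bfloat 1.0 values, bit pattern 0x3F80 each, into one operand:

declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)

define amdgpu_kernel void @mfma_bf16_ones(<32 x float> addrspace(1)* %arg) {
bb:
  %in = load <32 x float>, <32 x float> addrspace(1)* %arg
  ; 1065369472 = 0x3F803F80, i.e. two bfloat 1.0 values packed into one i32
  %ab = bitcast i32 1065369472 to <2 x i16>
  %mai = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %ab, <2 x i16> %ab, <32 x float> %in, i32 0, i32 0, i32 0)
  store <32 x float> %mai, <32 x float> addrspace(1)* %arg
  ret void
}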
@@ -794,13 +794,13 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
   %a = bitcast i32 1 to <2 x i16>
   %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }

@@ -957,12 +957,12 @@
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
 ; GCN: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
 ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 0, i32 0, i32 0)
-  %mai.2 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %mai.1, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.2, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.2, <32 x float> addrspace(1)* %arg
   ret void
 }

@@ -1112,10 +1112,10 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }

@@ -1184,7 +1184,7 @@
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1
+; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
@@ -1256,10 +1256,10 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> , i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
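
The check update from "1" to "1.0" above falls directly out of the retyping: the accumulator elements are now f32, so the nonzero lane of the constant is materialized with the floating-point inline constant 1.0 rather than the integer 1. A sketch of an equivalent constant accumulator without spelling out all 32 lanes (hypothetical kernel, not from the patch; the insertelement constant-folds to a vector that is 0.0 everywhere except lane 1):

declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

define amdgpu_kernel void @mfma_imm_one_lane(<32 x float> addrspace(1)* %arg) {
bb:
  ; folds to <32 x float> constant: lane 1 = 1.0, all other lanes = 0.0
  %acc = insertelement <32 x float> zeroinitializer, float 1.0, i32 1
  %mai = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 0, i32 0, i32 0)
  store <32 x float> %mai, <32 x float> addrspace(1)* %arg
  ret void
}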
@@ -1350,12 +1350,12 @@
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %gep
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
   ret void
 }

Index: llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -84,23 +84,23 @@
 ; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; A2V: ScratchSize: 0
-define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 {
+define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
 bb:
   %v = call i32 asm sideeffect "", "=a"()
   br label %use

 use:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> , i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> , i32 0, i32 0, i32 0)
   call void asm sideeffect "", "a"(i32 %v)
-  %elt1 = extractelement <32 x i32> %mai.1, i32 0
-  store i32 %elt1, i32 addrspace(1)* %arg
+  %elt1 = extractelement <32 x float> %mai.1, i32 0
+  store float %elt1, float addrspace(1)* %arg
   ret void
 }

 declare i32 @llvm.amdgcn.workitem.id.x()
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

 attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
 attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }

Index: llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -233,19 +233,23 @@
   ret void
 }

+; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
+; produces unnecessary copies and we still have some amount
+; of conventional spilling.
+
 ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GFX908-NOT: SCRATCH_RSRC
+; GFX908-FIXME-NOT: SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_write_b32 a0, v
 ; GFX900: buffer_store_dword v
 ; GFX900: buffer_load_dword v
-; GFX908-NOT: buffer_
+; GFX908-FIXME-NOT: buffer_
 ; GFX908-DAG v_accvgpr_read_b32
 ; GCN: NumVgprs: 256
 ; GFX900: ScratchSize: 580
-; GFX908: ScratchSize: 0
+; GFX908-FIXME: ScratchSize: 0
 ; GCN: VGPRBlocks: 63
 ; GCN: NumVGPRsForWavesPerEU: 256
 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {