Index: include/llvm/IR/IntrinsicsPowerPC.td
===================================================================
--- include/llvm/IR/IntrinsicsPowerPC.td
+++ include/llvm/IR/IntrinsicsPowerPC.td
@@ -516,6 +516,18 @@
 let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
 
+// Vector load.
+def int_ppc_vsx_lxvw4x :
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+def int_ppc_vsx_lxvd2x :
+      Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+
+// Vector store.
+def int_ppc_vsx_stxvw4x :
+      Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_ppc_vsx_stxvd2x :
+      Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+
 // Vector maximum.
 def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
 def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7567,8 +7567,12 @@
     default: return false;
     case Intrinsic::ppc_altivec_lvx:
     case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::ppc_vsx_lxvw4x:
       VT = MVT::v4i32;
       break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
     case Intrinsic::ppc_altivec_lvebx:
       VT = MVT::i8;
       break;
@@ -7589,8 +7593,12 @@
     default: return false;
     case Intrinsic::ppc_altivec_stvx:
     case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
       VT = MVT::v4i32;
       break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
     case Intrinsic::ppc_altivec_stvebx:
       VT = MVT::i8;
       break;
@@ -9078,7 +9086,9 @@
   case Intrinsic::ppc_altivec_lvxl:
   case Intrinsic::ppc_altivec_lvebx:
   case Intrinsic::ppc_altivec_lvehx:
-  case Intrinsic::ppc_altivec_lvewx: {
+  case Intrinsic::ppc_altivec_lvewx:
+  case Intrinsic::ppc_vsx_lxvd2x:
+  case Intrinsic::ppc_vsx_lxvw4x: {
     EVT VT;
     switch (Intrinsic) {
     case Intrinsic::ppc_altivec_lvebx:
@@ -9090,6 +9100,9 @@
     case Intrinsic::ppc_altivec_lvewx:
       VT = MVT::i32;
       break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
     default:
       VT = MVT::v4i32;
       break;
@@ -9110,7 +9123,9 @@
   case Intrinsic::ppc_altivec_stvxl:
   case Intrinsic::ppc_altivec_stvebx:
   case Intrinsic::ppc_altivec_stvehx:
-  case Intrinsic::ppc_altivec_stvewx: {
+  case Intrinsic::ppc_altivec_stvewx:
+  case Intrinsic::ppc_vsx_stxvd2x:
+  case Intrinsic::ppc_vsx_stxvw4x: {
     EVT VT;
     switch (Intrinsic) {
     case Intrinsic::ppc_altivec_stvebx:
@@ -9122,6 +9137,9 @@
     case Intrinsic::ppc_altivec_stvewx:
       VT = MVT::i32;
       break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
     default:
       VT = MVT::v4i32;
       break;
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -55,7 +55,7 @@
     def LXVD2X : XX1Form<31, 844,
                          (outs vsrc:$XT), (ins memrr:$src),
                          "lxvd2x $XT, $src", IIC_LdStLFD,
-                         [(set v2f64:$XT, (load xoaddr:$src))]>;
+                         [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
 
     def LXVDSX : XX1Form<31, 332,
                          (outs vsrc:$XT), (ins memrr:$src),
@@ -64,7 +64,7 @@
     def LXVW4X : XX1Form<31, 780,
                          (outs vsrc:$XT), (ins memrr:$src),
                          "lxvw4x $XT, $src", IIC_LdStLFD,
-                         [(set v4i32:$XT, (load xoaddr:$src))]>;
+                         [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
   }
 
   // Store indexed instructions
@@ -77,12 +77,12 @@
     def STXVD2X : XX1Form<31, 972,
                           (outs), (ins vsrc:$XT, memrr:$dst),
                           "stxvd2x $XT, $dst", IIC_LdStSTFD,
-                          [(store v2f64:$XT, xoaddr:$dst)]>;
+                          [(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
 
     def STXVW4X : XX1Form<31, 908,
                           (outs), (ins vsrc:$XT, memrr:$dst),
                           "stxvw4x $XT, $dst", IIC_LdStSTFD,
-                          [(store v4i32:$XT, xoaddr:$dst)]>;
+                          [(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
   }
 
   // Add/Mul Instructions
@@ -851,11 +851,14 @@
                              (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
 
 // Loads.
+def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
 
 // Stores.
-def : Pat<(store v4i32:$rS, xoaddr:$dst),
-          (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
 
 // Selects.
 def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -613,6 +613,13 @@
       return new LoadInst(Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_lxvw4x:
+  case Intrinsic::ppc_vsx_lxvd2x: {
+    // Turn PPC VSX loads into normal loads.
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+                                        PointerType::getUnqual(II->getType()));
+    return new LoadInst(Ptr);
+  }
   case Intrinsic::ppc_altivec_stvx:
   case Intrinsic::ppc_altivec_stvxl:
     // Turn stvx -> store if the pointer is known aligned.
@@ -624,6 +631,13 @@
       return new StoreInst(II->getArgOperand(0), Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_stxvw4x:
+  case Intrinsic::ppc_vsx_stxvd2x: {
+    // Turn PPC VSX stores into normal stores.
+    Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+    return new StoreInst(II->getArgOperand(0), Ptr);
+  }
   case Intrinsic::x86_sse_storeu_ps:
   case Intrinsic::x86_sse2_storeu_pd:
   case Intrinsic::x86_sse2_storeu_dq:
Index: test/CodeGen/PowerPC/vsx-fma-m.ll
===================================================================
--- test/CodeGen/PowerPC/vsx-fma-m.ll
+++ test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -177,21 +177,27 @@
   store <2 x double> %1, <2 x double>* %arrayidx3, align 8
   ret void
 
+; Note: There is some unavoidable changeability in this variant. If the
+; FMAs are reordered differently, the algorithm can pick a different
+; multiplicand to destroy, changing the register assignment. There isn't
+; a good way to express this possibility, so hopefully this doesn't change
+; too often.
+
 ; CHECK-LABEL: @testv3
 ; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C1:[0-9]+]], 48
 ; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C3:[0-9]+]], 16
 
 ; Note: We could convert this next FMA to M-type as well, but it would require
 ; re-ordering the instructions.
 ; CHECK-DAG: xvmaddadp [[V1]], 35, 36
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddadp 34, 35, 38
 ; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK: blr
Index: test/CodeGen/PowerPC/vsx-ldst.ll
===================================================================
--- test/CodeGen/PowerPC/vsx-ldst.ll
+++ test/CodeGen/PowerPC/vsx-ldst.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+@vsi = global <4 x i32> , align 16
+@vui = global <4 x i32> , align 16
+@vf = global <4 x float> , align 16
+@vsll = global <2 x i64> , align 16
+@vull = global <2 x i64> , align 16
+@vd = global <2 x double> , align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+; Function Attrs: nounwind
+define void @test1() {
+entry:
+  %0 = load <4 x i32>* @vsi, align 16
+  %1 = load <4 x i32>* @vui, align 16
+  %2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
+  %3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
+  %4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
+  %5 = load <2 x double>* @vd, align 16
+  store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+  store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+  store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
+  store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
+  store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
+  store <2 x double> %5, <2 x double>* @res_vd, align 16
+  ret void
+}
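
For reference, here is a minimal IR-level sketch (not part of the patch) of how the new intrinsics are called directly. The function name @copy_v4i32 and the %src/%dst arguments are illustrative assumptions; the intrinsic names and the i8* pointer operand follow from the llvm_v4i32_ty/llvm_ptr_ty definitions added to IntrinsicsPowerPC.td above. With this patch, InstCombine rewrites such calls into ordinary vector loads and stores, which the new PPCInstrVSX.td patterns then select back to lxvw4x/stxvw4x.

; Illustrative sketch only; uses the typed-pointer IR syntax of the test files above.
declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)

define void @copy_v4i32(i8* %src, i8* %dst) {
entry:
  ; Load 16 bytes through the VSX load intrinsic...
  %v = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %src)
  ; ...and store them back through the VSX store intrinsic.
  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %v, i8* %dst)
  ret void
}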