Index: lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
===================================================================
--- lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -271,7 +271,8 @@
   unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12;

   const MCOperand &MO = MI.getOperand(OpNo);
-  assert(MO.isImm());
+  assert(MO.isImm() && !(MO.getImm() % 16) &&
+         "Expecting an immediate that is a multiple of 16");

   return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits;
 }
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -305,6 +305,7 @@
     bool AllUsersSelectZero(SDNode *N);
     void SwapAllSelectUsers(SDNode *N);

+    bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
     void transferMemOperands(SDNode *N, SDNode *Result);
   };

@@ -3012,6 +3013,25 @@
   return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
 }

+/// Does this node represent a load/store node whose address can be represented
+/// with a register plus an immediate that's a multiple of \p Val?
+bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
+  LoadSDNode *LDN = dyn_cast<LoadSDNode>(N);
+  StoreSDNode *STN = dyn_cast<StoreSDNode>(N);
+  SDValue AddrOp;
+  if (LDN)
+    AddrOp = LDN->getOperand(1);
+  else if (STN)
+    AddrOp = STN->getOperand(2);
+
+  short Imm = 0;
+  if (AddrOp.getOpcode() == ISD::ADD)
+    return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val);
+
+  // If the address comes from the outside, the offset will be zero.
+  return AddrOp.getOpcode() == ISD::CopyFromReg;
+}
+
 void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
   // Transfer memoperands.
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
Index: lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- lib/Target/PowerPC/PPCInstrInfo.td
+++ lib/Target/PowerPC/PPCInstrInfo.td
@@ -401,6 +401,25 @@
   return cast<LoadSDNode>(N)->getAlignment() < 4;
 }]>;

+// This is a somewhat weaker condition than actually checking for 16-byte
+// alignment. It simply checks that the displacement can be represented
+// as an immediate that is a multiple of 16 (i.e. the requirement for DQ-Form
+// instructions).
+def aligned16load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isOffsetMultipleOf(N, 16);
+}]>;
+def aligned16store : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return isOffsetMultipleOf(N, 16);
+}]>;
+def unaligned16load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return !isOffsetMultipleOf(N, 16);
+}]>;
+def unaligned16store : PatFrag<(ops node:$val, node:$ptr),
+                               (store node:$val, node:$ptr), [{
+  return !isOffsetMultipleOf(N, 16);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC Flag Definitions.
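The three hunks above all encode the same DQ-Form constraint: the displacement
must be a signed 16-bit value that is a multiple of 16, because the
instruction's 12-bit DQ field holds the byte offset scaled down by 16. A
minimal standalone sketch of that rule, with hypothetical helper names
(isValidDQFormDisp and encodeDQDisp are for illustration only, not in-tree
APIs):

  #include <cassert>
  #include <cstdint>

  // A DQ-Form displacement must fit in a signed 16-bit immediate and have
  // its low four bits clear (i.e. be a multiple of 16).
  static bool isValidDQFormDisp(int64_t Imm) {
    return Imm >= -32768 && Imm <= 32767 && Imm % 16 == 0;
  }

  // Mirrors the PPCMCCodeEmitter hunk: scale the byte offset down by 16 and
  // pack it into the low 12 bits of the encoding.
  static unsigned encodeDQDisp(int64_t Imm) {
    assert(isValidDQFormDisp(Imm) && "Expecting a multiple of 16");
    return (static_cast<uint64_t>(Imm) >> 4) & 0xFFF;
  }
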
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -2514,37 +2514,31 @@
   } // IsLittleEndian, HasP9Vector

   // D-Form Load/Store
-  def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>;
-  def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>;
-  def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>;
-  def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>;
-  def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>;
-  def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>;
-
-  def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
-  def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
-  def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
-  def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst),
-            (STXV $rS, memrix16:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst),
-            (STXV $rS, memrix16:$dst)>;
-
-
-  def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>;
-  def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>;
-  def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>;
-  def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>;
-  def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>;
-  def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>;
-  def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
-  def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
-  def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
-  def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst),
-            (STXVX $rS, xaddr:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst),
-            (STXVX $rS, xaddr:$dst)>;
+  def : Pat<(v4i32 (aligned16load iaddr:$src)), (LXV memrix16:$src)>;
+  def : Pat<(v4f32 (aligned16load iaddr:$src)), (LXV memrix16:$src)>;
+  def : Pat<(v2i64 (aligned16load iaddr:$src)), (LXV memrix16:$src)>;
+  def : Pat<(v2f64 (aligned16load iaddr:$src)), (LXV memrix16:$src)>;
+
+  def : Pat<(aligned16store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+  def : Pat<(aligned16store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+  def : Pat<(aligned16store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+  def : Pat<(aligned16store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+
+
+  def : Pat<(v2f64 (unaligned16load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v2i64 (unaligned16load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v4f32 (unaligned16load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v4i32 (unaligned16load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(unaligned16store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(unaligned16store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(unaligned16store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(unaligned16store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+            (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+            (STXVX $rS, xoaddr:$dst)>;

   def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
             (v4i32 (LXVWSX xoaddr:$src))>;
   def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
Index: test/CodeGen/PowerPC/PR33671.ll
===================================================================
--- test/CodeGen/PowerPC/PR33671.ll
+++ test/CodeGen/PowerPC/PR33671.ll
@@ -0,0 +1,32 @@
+; Function Attrs: norecurse nounwind
+; RUN: llc -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 < %s | FileCheck %s
+define void @test1(i32* nocapture readonly %arr, i32* nocapture %arrTo) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 4
+  %0 = bitcast i32* %arrayidx to <4 x i32>*
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 4
+  %1 = bitcast i32* %arrayidx1 to <4 x i32>*
+  %2 = load <4 x i32>, <4 x i32>* %1, align 16
+  store <4 x i32> %2, <4 x i32>* %0, align 16
+  ret void
+; CHECK-LABEL: test1
+; CHECK: lxv [[LD:[0-9]+]], 16(3)
+; CHECK: stxv [[LD]], 16(4)
+}
+
+; Function Attrs: norecurse nounwind
+define void @test2(i32* nocapture readonly %arr, i32* nocapture %arrTo) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 1
+  %0 = bitcast i32* %arrayidx to <4 x i32>*
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
+  %1 = bitcast i32* %arrayidx1 to <4 x i32>*
+  %2 = load <4 x i32>, <4 x i32>* %1, align 16
+  store <4 x i32> %2, <4 x i32>* %0, align 16
+  ret void
+; CHECK-LABEL: test2
+; CHECK-DAG: li [[IMM1:[0-9]+]], 8
+; CHECK-DAG: li [[IMM2:[0-9]+]], 4
+; CHECK: lxvx [[LD:[0-9]+]], 3, [[IMM1]]
+; CHECK: stxvx [[LD]], 4, [[IMM2]]
+}
Index: test/CodeGen/PowerPC/build-vector-tests.ll
===================================================================
--- test/CodeGen/PowerPC/build-vector-tests.ll
+++ test/CodeGen/PowerPC/build-vector-tests.ll
@@ -1017,14 +1017,16 @@
 ; P9LE-LABEL: fromDiffMemVarDi
 ; P8BE-LABEL: fromDiffMemVarDi
 ; P8LE-LABEL: fromDiffMemVarDi
-; P9BE: sldi {{r[0-9]+}}, r4, 2
-; P9BE-DAG: lxv {{v[0-9]+}}
-; P9BE-DAG: lxv
+; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2
+; P9BE-DAG: li [[IMM:r[0-9]+]], -12
+; P9BE-DAG: lxvx {{v[0-9]+}}, r3, [[IMM]]
+; P9BE-DAG: lxvx
 ; P9BE: vperm
 ; P9BE: blr
 ; P9LE: sldi {{r[0-9]+}}, r4, 2
-; P9LE-DAG: lxv {{v[0-9]+}}
-; P9LE-DAG: lxv
+; P9LE-DAG: li [[IMM:r[0-9]+]], -12
+; P9LE-DAG: lxvx {{v[0-9]+}}, r3, [[IMM]]
+; P9LE-DAG: lxvx
 ; P9LE: vperm
 ; P9LE: blr
 ; P8BE: sldi {{r[0-9]+}}, r4, 2
@@ -2177,12 +2179,14 @@
 ; P8BE-LABEL: fromDiffMemVarDui
 ; P8LE-LABEL: fromDiffMemVarDui
 ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2
-; P9BE-DAG: lxv {{v[0-9]+}}, -12(r3)
+; P9BE-DAG: li [[OFFSET:r[0-9]+]], -12
+; P9BE-DAG: lxvx {{v[0-9]+}}, r3, [[OFFSET]]
 ; P9BE-DAG: lxv
 ; P9BE: vperm
 ; P9BE: blr
 ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2
-; P9LE-DAG: lxv {{v[0-9]+}}, -12(r3)
+; P9LE-DAG: li [[OFFSET:r[0-9]+]], -12
+; P9LE-DAG: lxvx {{v[0-9]+}}, r3, [[OFFSET]]
 ; P9LE-DAG: lxv
 ; P9LE: vperm
 ; P9LE: blr
@@ -3466,9 +3470,9 @@
 ; P9LE-LABEL: fromDiffConstsConvftoll
 ; P8BE-LABEL: fromDiffConstsConvftoll
 ; P8LE-LABEL: fromDiffConstsConvftoll
-; P9BE: lxv v2
+; P9BE: lxvx v2
 ; P9BE: blr
-; P9LE: lxv v2
+; P9LE: lxvx v2
 ; P9LE: blr
 ; P8BE: lxvd2x v2
 ; P8BE: blr
@@ -4370,9 +4374,9 @@
 ; P9LE-LABEL: fromDiffConstsConvftoull
 ; P8BE-LABEL: fromDiffConstsConvftoull
 ; P8LE-LABEL: fromDiffConstsConvftoull
-; P9BE: lxv v2
+; P9BE: lxvx v2
 ; P9BE: blr
-; P9LE: lxv v2
+; P9LE: lxvx v2
 ; P9LE: blr
 ; P8BE: lxvd2x v2
 ; P8BE: blr
Index: test/CodeGen/PowerPC/ppc64-i128-abi.ll
===================================================================
--- test/CodeGen/PowerPC/ppc64-i128-abi.ll
+++ test/CodeGen/PowerPC/ppc64-i128-abi.ll
@@ -63,7 +63,7 @@
 ; FIXME: li [[R1:r[0-9]+]], 1
 ; FIXME: li [[R2:r[0-9]+]], 0
 ; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]]
-; CHECK-P9: lxv [[V1:v[0-9]+]]
+; CHECK-P9: lxvx [[V1:v[0-9]+]]
 ; CHECK-P9: vadduqm v2, v2, [[V1]]
 ; CHECK-P9: blr
@@ -237,8 +237,8 @@
 ; CHECK-LE: blr

 ; CHECK-P9-LABEL: @call_v1i128_increment_by_val
-; CHECK-P9-DAG: lxv v2
-; CHECK-P9-DAG: lxv v3
+; CHECK-P9-DAG: lxvx v2
+; CHECK-P9-DAG: lxvx v3
 ; CHECK-P9: bl v1i128_increment_by_val
 ; CHECK-P9: blr
Index: test/CodeGen/PowerPC/swaps-le-6.ll
===================================================================
--- test/CodeGen/PowerPC/swaps-le-6.ll
+++ test/CodeGen/PowerPC/swaps-le-6.ll
@@ -33,11 +33,11 @@
 ; CHECK: stxvd2x [[REG5]]

 ; CHECK-P9-LABEL: @bar0
-; CHECK-P9-DAG: lxv [[REG1:[0-9]+]]
+; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]]
 ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3)
 ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
 ; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1
-; CHECK-P9: stxv [[REG5]]
+; CHECK-P9: stxvx [[REG5]]

 define void @bar1() {
 entry:
@@ -56,9 +56,9 @@
 ; CHECK: stxvd2x [[REG5]]

 ; CHECK-P9-LABEL: @bar1
-; CHECK-P9-DAG: lxv [[REG1:[0-9]+]]
+; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]]
 ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3)
 ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
 ; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]]
-; CHECK-P9: stxv [[REG5]]
+; CHECK-P9: stxvx [[REG5]]
Index: test/CodeGen/PowerPC/vsx-p9.ll
===================================================================
--- test/CodeGen/PowerPC/vsx-p9.ll
+++ test/CodeGen/PowerPC/vsx-p9.ll
@@ -36,8 +36,8 @@
   %1 = load <16 x i8>, <16 x i8>* @ucb, align 16
   %add.i = add <16 x i8> %1, %0
   tail call void (...) @sink(<16 x i8> %add.i)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vaddubm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -45,8 +45,8 @@
   %3 = load <16 x i8>, <16 x i8>* @scb, align 16
   %add.i22 = add <16 x i8> %3, %2
   tail call void (...) @sink(<16 x i8> %add.i22)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vaddubm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -54,8 +54,8 @@
   %5 = load <8 x i16>, <8 x i16>* @usb, align 16
   %add.i21 = add <8 x i16> %5, %4
   tail call void (...) @sink(<8 x i16> %add.i21)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vadduhm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -63,8 +63,8 @@
   %7 = load <8 x i16>, <8 x i16>* @ssb, align 16
   %add.i20 = add <8 x i16> %7, %6
   tail call void (...) @sink(<8 x i16> %add.i20)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vadduhm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -72,8 +72,8 @@
   %9 = load <4 x i32>, <4 x i32>* @uib, align 16
   %add.i19 = add <4 x i32> %9, %8
   tail call void (...) @sink(<4 x i32> %add.i19)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vadduwm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -81,8 +81,8 @@
   %11 = load <4 x i32>, <4 x i32>* @sib, align 16
   %add.i18 = add <4 x i32> %11, %10
   tail call void (...) @sink(<4 x i32> %add.i18)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vadduwm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -90,8 +90,8 @@
   %13 = load <2 x i64>, <2 x i64>* @ullb, align 16
   %add.i17 = add <2 x i64> %13, %12
   tail call void (...) @sink(<2 x i64> %add.i17)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vaddudm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -99,8 +99,8 @@
   %15 = load <2 x i64>, <2 x i64>* @sllb, align 16
   %add.i16 = add <2 x i64> %15, %14
   tail call void (...) @sink(<2 x i64> %add.i16)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vaddudm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -108,8 +108,8 @@
   %17 = load <1 x i128>, <1 x i128>* @uxb, align 16
   %add.i15 = add <1 x i128> %17, %16
   tail call void (...) @sink(<1 x i128> %add.i15)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vadduqm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -117,8 +117,8 @@
   %19 = load <1 x i128>, <1 x i128>* @sxb, align 16
   %add.i14 = add <1 x i128> %19, %18
   tail call void (...) @sink(<1 x i128> %add.i14)
-; CHECK: lxv 34, 0(3)
-; CHECK: lxv 35, 0(4)
+; CHECK: lxvx 34, 0, 3
+; CHECK: lxvx 35, 0, 4
 ; CHECK: vadduqm 2, 3, 2
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -126,8 +126,8 @@
   %21 = load <4 x float>, <4 x float>* @vfb, align 16
   %add.i13 = fadd <4 x float> %20, %21
   tail call void (...) @sink(<4 x float> %add.i13)
-; CHECK: lxv 0, 0(3)
-; CHECK: lxv 1, 0(4)
+; CHECK: lxvx 0, 0, 3
+; CHECK: lxvx 1, 0, 4
 ; CHECK: xvaddsp 34, 0, 1
 ; CHECK: stxv 34,
 ; CHECK: bl sink
@@ -135,8 +135,8 @@
   %23 = load <2 x double>, <2 x double>* @vdb, align 16
   %add.i12 = fadd <2 x double> %22, %23
   tail call void (...) @sink(<2 x double> %add.i12)
-; CHECK: lxv 0, 0(3)
-; CHECK: lxv 1, 0(4)
+; CHECK: lxvx 0, 0, 3
+; CHECK: lxvx 1, 0, 4
 ; CHECK: xvadddp 0, 0, 1
 ; CHECK: stxv 0,
 ; CHECK: bl sink
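All of the CHECK updates above follow from the same selection rule: a load or
store keeps the DQ-Form lxv/stxv only when its address is a register plus an
immediate that is a multiple of 16, or a plain incoming register (where the
offset is zero); addresses computed any other way, such as the TOC-based
global accesses in vsx-p9.ll, conservatively select the X-Form lxvx/stxvx. A
minimal sketch of that classification (VSXForm, selectVSXForm, and the bool
parameters are assumed names for illustration, not the in-tree API):

  #include <cstdint>

  // DQ-Form (lxv/stxv) takes disp(RA) with a multiple-of-16 displacement;
  // X-Form (lxvx/stxvx) takes RA+RB and has no displacement restriction.
  enum class VSXForm { DQ, X };

  static VSXForm selectVSXForm(bool IsRegPlusImm, bool IsPlainReg,
                               int16_t Imm) {
    if (IsRegPlusImm)                  // reg + known immediate offset
      return Imm % 16 == 0 ? VSXForm::DQ : VSXForm::X;
    if (IsPlainReg)                    // CopyFromReg: the offset is zero
      return VSXForm::DQ;
    return VSXForm::X;                 // anything else: be conservative
  }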