diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1590,16 +1590,32 @@
     if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
       LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                         << SUb->NodeNum << ")\n");
-      // Copy successor edges from SUa to SUb. Interleaving computation
-      // dependent on SUa can prevent load combining due to register reuse.
-      // Predecessor edges do not need to be copied from SUb to SUa since
-      // nearby loads should have effectively the same inputs.
-      for (const SDep &Succ : SUa->Succs) {
-        if (Succ.getSUnit() == SUb)
-          continue;
-        LLVM_DEBUG(dbgs()
-                   << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
-        DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+      if (IsLoad) {
+        // Copy successor edges from SUa to SUb. Interleaving computation
+        // dependent on SUa can prevent load combining due to register reuse.
+        // Predecessor edges do not need to be copied from SUb to SUa since
+        // nearby loads should have effectively the same inputs.
+        for (const SDep &Succ : SUa->Succs) {
+          if (Succ.getSUnit() == SUb)
+            continue;
+          LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                            << ")\n");
+          DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+        }
+      } else {
+        // Copy predecessor edges from SUb to SUa so that the SUnits SUb
+        // depends on cannot be scheduled in-between SUa and SUb. Successor
+        // edges do not need to be copied from SUa to SUb since no
+        // computation depends on the result of a store.
+        // Note that we don't need to worry about memory dependencies here:
+        // we never try to cluster two mem ops that have a memory dependency
+        // between them.
+        for (const SDep &Pred : SUb->Preds) {
+          if (Pred.getSUnit() == SUa)
+            continue;
+          LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
+                            << ")\n");
+          DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
+        }
       }
       ++ClusterLength;
     } else
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -204,7 +204,6 @@
   bool tryBitfieldInsert(SDNode *N);
   bool tryBitPermutation(SDNode *N);
   bool tryIntCompareInGPR(SDNode *N);
-  bool tryAndWithMask(SDNode *N);

   // tryTLSXFormLoad - Convert an ISD::LOAD fed by a PPCISD::ADD_TLS into
   // an X-Form load instruction with the offset being a relocation coming from
@@ -343,6 +342,11 @@
 private:
   bool trySETCC(SDNode *N);
+  bool tryAsSingleRLDICL(SDNode *N);
+  bool tryAsSingleRLDICR(SDNode *N);
+  bool tryAsSingleRLWINM(SDNode *N);
+  bool tryAsSingleRLWINM8(SDNode *N);
+  bool tryAsSingleRLWIMI(SDNode *N);

   void PeepholePPC64();
   void PeepholePPC64ZExt();
@@ -4371,142 +4375,172 @@
   return true;
 }

-bool PPCDAGToDAGISel::tryAndWithMask(SDNode *N) {
-  if (N->getOpcode() != ISD::AND)
+bool PPCDAGToDAGISel::tryAsSingleRLWINM(SDNode *N) {
+  assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+  unsigned Imm;
+  if (!isInt32Immediate(N->getOperand(1), Imm))
     return false;

   SDLoc dl(N);
   SDValue Val = N->getOperand(0);
-  unsigned Imm, Imm2, SH, MB, ME;
-  uint64_t Imm64;
-
+  unsigned SH, MB, ME;
   // If this is an and of a value rotated between 0 and 31 bits and then and'd
   // with a mask, emit rlwinm
-  if (isInt32Immediate(N->getOperand(1), Imm) &&
-      isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
-    SDValue Val = N->getOperand(0).getOperand(0);
-    SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
-                      getI32Imm(ME, dl) };
+  if (isRotateAndMask(Val.getNode(), Imm, false, SH, MB, ME)) {
+    Val = Val.getOperand(0);
+    SDValue Ops[] = {Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
+                     getI32Imm(ME, dl)};
     CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
     return true;
   }

   // If this is just a masked value where the input is not handled, and
   // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
-  if (isInt32Immediate(N->getOperand(1), Imm)) {
-    if (isRunOfOnes(Imm, MB, ME) &&
-        N->getOperand(0).getOpcode() != ISD::ROTL) {
-      SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl),
-                        getI32Imm(ME, dl) };
-      CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
-      return true;
-    }
-    // AND X, 0 -> 0, not "rlwinm 32".
-    if (Imm == 0) {
-      ReplaceUses(SDValue(N, 0), N->getOperand(1));
-      return true;
-    }
+  if (isRunOfOnes(Imm, MB, ME) && Val.getOpcode() != ISD::ROTL) {
+    SDValue Ops[] = {Val, getI32Imm(0, dl), getI32Imm(MB, dl),
+                     getI32Imm(ME, dl)};
+    CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+    return true;
+  }

-    // ISD::OR doesn't get all the bitfield insertion fun.
-    // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
-    // bitfield insert.
-    if (N->getOperand(0).getOpcode() == ISD::OR &&
-        isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
-      // The idea here is to check whether this is equivalent to:
-      //   (c1 & m) | (x & ~m)
-      // where m is a run-of-ones mask. The logic here is that, for each bit in
-      // c1 and c2:
-      // - if both are 1, then the output will be 1.
-      // - if both are 0, then the output will be 0.
-      // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will
-      //   come from x.
-      // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will
-      //   be 0.
-      // If that last condition is never the case, then we can form m from the
-      // bits that are the same between c1 and c2.
-      unsigned MB, ME;
-      if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) {
-        SDValue Ops[] = { N->getOperand(0).getOperand(0),
-                          N->getOperand(0).getOperand(1),
-                          getI32Imm(0, dl), getI32Imm(MB, dl),
-                          getI32Imm(ME, dl) };
-        ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
-        return true;
-      }
-    }
-  } else if (isInt64Immediate(N->getOperand(1).getNode(), Imm64)) {
-    // If this is a 64-bit zero-extension mask, emit rldicl.
-    if (isMask_64(Imm64)) {
-      MB = 64 - countTrailingOnes(Imm64);
-      SH = 0;
-
-      if (Val.getOpcode() == ISD::ANY_EXTEND) {
-        auto Op0 = Val.getOperand(0);
-        if ( Op0.getOpcode() == ISD::SRL &&
-           isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) {
-
-          auto ResultType = Val.getNode()->getValueType(0);
-          auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl,
-                                              ResultType);
-          SDValue IDVal (ImDef, 0);
-
-          Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl,
-                        ResultType, IDVal, Op0.getOperand(0),
-                        getI32Imm(1, dl)), 0);
-          SH = 64 - Imm;
-        }
-      }
+  // AND X, 0 -> 0, not "rlwinm 32".
+  if (Imm == 0) {
+    ReplaceUses(SDValue(N, 0), N->getOperand(1));
+    return true;
+  }

-      // If the operand is a logical right shift, we can fold it into this
-      // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
-      // for n <= mb. The right shift is really a left rotate followed by a
-      // mask, and this mask is a more-restrictive sub-mask of the mask implied
-      // by the shift.
-      if (Val.getOpcode() == ISD::SRL &&
-          isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
-        assert(Imm < 64 && "Illegal shift amount");
-        Val = Val.getOperand(0);
-        SH = 64 - Imm;
-      }
+  return false;
+}

-      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
-      CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
-      return true;
-    } else if (isMask_64(~Imm64)) {
-      // If this is a negated 64-bit zero-extension mask,
-      // i.e. the immediate is a sequence of ones from most significant side
-      // and all zero for reminder, we should use rldicr.
-      MB = 63 - countTrailingOnes(~Imm64);
-      SH = 0;
-      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
-      CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
-      return true;
-    }
+bool PPCDAGToDAGISel::tryAsSingleRLWINM8(SDNode *N) {
+  assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+  uint64_t Imm64;
+  if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64))
+    return false;

-    // It is not 16-bit imm that means we need two instructions at least if
-    // using "and" instruction. Try to exploit it with rotate mask instructions.
-    if (isRunOfOnes64(Imm64, MB, ME)) {
-      if (MB >= 32 && MB <= ME) {
-        //                MB  ME
-        // +----------------------+
-        // |xxxxxxxxxxx00011111000|
-        // +----------------------+
-        //  0      32           64
-        // We can only do it if the MB is larger than 32 and MB <= ME
-        // as RLWINM will replace the content of [0 - 32) with [32 - 64) even
-        // we didn't rotate it.
-        SDValue Ops[] = { Val, getI64Imm(0, dl), getI64Imm(MB - 32, dl),
-                          getI64Imm(ME - 32, dl) };
-        CurDAG->SelectNodeTo(N, PPC::RLWINM8, MVT::i64, Ops);
-        return true;
-      }
-      // TODO - handle it with rldicl + rldicl
-    }
+  unsigned MB, ME;
+  if (isRunOfOnes64(Imm64, MB, ME) && MB >= 32 && MB <= ME) {
+    //                MB  ME
+    // +----------------------+
+    // |xxxxxxxxxxx00011111000|
+    // +----------------------+
+    //  0      32           64
+    // We can only do this if MB >= 32 and MB <= ME, as RLWINM will replace
+    // the contents of [0, 32) with [32, 64) even if we didn't rotate it.
+    SDLoc dl(N);
+    SDValue Ops[] = {N->getOperand(0), getI64Imm(0, dl), getI64Imm(MB - 32, dl),
+                     getI64Imm(ME - 32, dl)};
+    CurDAG->SelectNodeTo(N, PPC::RLWINM8, MVT::i64, Ops);
+    return true;
+  }
+
+  return false;
+}
+
+bool PPCDAGToDAGISel::tryAsSingleRLWIMI(SDNode *N) {
+  assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+  unsigned Imm;
+  if (!isInt32Immediate(N->getOperand(1), Imm))
+    return false;
+
+  SDValue Val = N->getOperand(0);
+  unsigned Imm2;
+  // ISD::OR doesn't get all the bitfield insertion fun.
+  // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
+  // bitfield insert.
+  if (Val.getOpcode() != ISD::OR || !isInt32Immediate(Val.getOperand(1), Imm2))
+    return false;
+
+  // The idea here is to check whether this is equivalent to:
+  //   (c1 & m) | (x & ~m)
+  // where m is a run-of-ones mask. The logic here is that, for each bit in
+  // c1 and c2:
+  // - if both are 1, then the output will be 1.
+  // - if both are 0, then the output will be 0.
+  // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will
+  //   come from x.
+  // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will
+  //   be 0.
+  // If that last condition is never the case, then we can form m from the
+  // bits that are the same between c1 and c2.
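+  // A concrete example (values invented for illustration, not from the
+  // source): for (and (or x, 0x00FF0000), 0xFFFF0000), no bit is 1 in c1
+  // but 0 in c2, and ~(c1 ^ c2) = 0x00FFFFFF is a run of ones
+  // (MB = 8, ME = 31), so the node folds to a single RLWIMI computing
+  // (0x00FF0000 & 0x00FFFFFF) | (x & 0xFF000000).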
+  unsigned MB, ME;
+  if (isRunOfOnes(~(Imm ^ Imm2), MB, ME) && !(~Imm & Imm2)) {
+    SDLoc dl(N);
+    SDValue Ops[] = {Val.getOperand(0), Val.getOperand(1), getI32Imm(0, dl),
+                     getI32Imm(MB, dl), getI32Imm(ME, dl)};
+    ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+    return true;
   }

   return false;
 }

+bool PPCDAGToDAGISel::tryAsSingleRLDICL(SDNode *N) {
+  assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+  uint64_t Imm64;
+  if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64) || !isMask_64(Imm64))
+    return false;
+
+  // If this is a 64-bit zero-extension mask, emit rldicl.
+  unsigned MB = 64 - countTrailingOnes(Imm64);
+  unsigned SH = 0;
+  unsigned Imm;
+  SDValue Val = N->getOperand(0);
+  SDLoc dl(N);
+
+  if (Val.getOpcode() == ISD::ANY_EXTEND) {
+    auto Op0 = Val.getOperand(0);
+    if (Op0.getOpcode() == ISD::SRL &&
+        isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) {
+
+      auto ResultType = Val.getNode()->getValueType(0);
+      auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, ResultType);
+      SDValue IDVal(ImDef, 0);
+
+      Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, ResultType,
+                                           IDVal, Op0.getOperand(0),
+                                           getI32Imm(1, dl)),
+                    0);
+      SH = 64 - Imm;
+    }
+  }
+
+  // If the operand is a logical right shift, we can fold it into this
+  // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
+  // for n <= mb. The right shift is really a left rotate followed by a
+  // mask, and this mask is a more-restrictive sub-mask of the mask implied
+  // by the shift.
+  if (Val.getOpcode() == ISD::SRL &&
+      isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
+    assert(Imm < 64 && "Illegal shift amount");
+    Val = Val.getOperand(0);
+    SH = 64 - Imm;
+  }
+
+  SDValue Ops[] = {Val, getI32Imm(SH, dl), getI32Imm(MB, dl)};
+  CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+  return true;
+}
+
+bool PPCDAGToDAGISel::tryAsSingleRLDICR(SDNode *N) {
+  assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+  uint64_t Imm64;
+  if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64) ||
+      !isMask_64(~Imm64))
+    return false;
+
+  // If this is a negated 64-bit zero-extension mask, i.e. the immediate is
+  // a sequence of ones from the most significant side and zeros for the
+  // remainder, we should use rldicr.
+  unsigned MB = 63 - countTrailingOnes(~Imm64);
+  unsigned SH = 0;
+  SDLoc dl(N);
+  SDValue Ops[] = {N->getOperand(0), getI32Imm(SH, dl), getI32Imm(MB, dl)};
+  CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+  return true;
+}
+
 // Select - Convert the specified operand from a target-independent to a
 // target-specific node if it hasn't already been changed.
 void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -4730,7 +4764,8 @@

   case ISD::AND:
     // If this is an 'and' with a mask, try to emit rlwinm/rldicl/rldicr
-    if (tryAndWithMask(N))
+    if (tryAsSingleRLWINM(N) || tryAsSingleRLWIMI(N) || tryAsSingleRLDICL(N) ||
+        tryAsSingleRLDICR(N) || tryAsSingleRLWINM8(N))
      return;

    // Other cases are autogenerated.
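For intuition, here is a minimal standalone C++20 sketch (illustrative only, not part of the patch: the bit utilities are reimplemented with <bit>, only non-wrapping runs of ones are recognized, and the 32-bit rlwinm/rlwimi paths are not modeled) of how a 64-bit AND immediate falls into the buckets the new helpers select on:

#include <bit>
#include <cstdint>
#include <cstdio>

// Ones from bit 0 upward, e.g. 0x00FFFFFF (a zero-extension mask).
static bool isMask64(uint64_t V) { return V && ((V + 1) & V) == 0; }

// A single contiguous run of ones anywhere in the word, reported in the
// PPC big-endian bit numbering used by the patch (bit 0 is the MSB).
// Simplified: wrapping runs are not recognized here.
static bool isRunOfOnes64(uint64_t V, unsigned &MB, unsigned &ME) {
  if (!V)
    return false;
  uint64_t Shifted = V >> std::countr_zero(V);
  if (Shifted & (Shifted + 1)) // the ones are not contiguous
    return false;
  MB = std::countl_zero(V);
  ME = 63 - std::countr_zero(V);
  return true;
}

// Mirrors the checks behind the new 64-bit helpers.
static const char *classify(uint64_t Imm64) {
  unsigned MB, ME;
  if (isMask64(Imm64))
    return "rldicl"; // ones in the least significant bits
  if (isMask64(~Imm64))
    return "rldicr"; // ones in the most significant bits
  if (isRunOfOnes64(Imm64, MB, ME) && MB >= 32 && MB <= ME)
    return "rlwinm8"; // run confined to the low 32-bit word
  return "no single rotate-and-mask";
}

int main() {
  printf("%s\n", classify(0x00000000FFFFFFFFull)); // rldicl
  printf("%s\n", classify(0xFFFFFFFF00000000ull)); // rldicr
  printf("%s\n", classify(0x000000000FFFFFF0ull)); // rlwinm8
  printf("%s\n", classify(0x0F0F0F0F0F0F0F0Full)); // no single rotate-and-mask
}

With these assumptions, classify(0x000000000FFFFFF0) reports rlwinm8 because the run of ones lies entirely within the low word (MB = 36 >= 32), matching the condition in tryAsSingleRLWINM8.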
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -365,26 +365,26 @@
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
-; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
-; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
-; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
-; SI-NEXT:    v_or_b32_e32 v1, v1, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v7, v3
-; SI-NEXT:    v_or_b32_e32 v3, v5, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v7, v0
+; SI-NEXT:    v_or_b32_e32 v0, v3, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -307,6 +307,7 @@
 ; GCN: flat_load_dwordx4
 ; GCN: flat_load_dwordx4

+; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
@@ -315,7 +316,6 @@

 ; GCN: flat_store_dwordx4

-; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -173,9 +173,9 @@
 ; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
 ; CI-NOHSA-NOT: v_add
-; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
 ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -135,10 +135,9 @@
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    s_bfe_u32 s0, s0, 0x10010
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_mov_b32 s4, 2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: widen_i17_constant_load:
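Returning to the MachineScheduler hunk at the top: the following toy model (hypothetical stand-ins for SUnit/SDep, not LLVM's actual API) shows the two edge-copying directions, assuming SUa is the first mem op of the cluster and SUb the second:

#include <vector>

// Stripped-down stand-ins; only the edge-direction logic is modeled.
struct SUnit {
  int NodeNum = 0;
  std::vector<SUnit *> Preds, Succs;
};

static void addArtificialEdge(SUnit &From, SUnit &To) {
  From.Succs.push_back(&To); // From must be scheduled before To
  To.Preds.push_back(&From);
}

static void copyClusterEdges(SUnit &SUa, SUnit &SUb, bool IsLoad) {
  if (IsLoad) {
    // Loads: anything that consumed SUa must now also wait for SUb, so no
    // computation can slip in between the two loads.
    for (SUnit *Succ : SUa.Succs)
      if (Succ != &SUb)
        addArtificialEdge(SUb, *Succ);
  } else {
    // Stores: anything SUb depends on must now be ready before SUa, so it
    // cannot be scheduled in between the two stores.
    for (SUnit *Pred : SUb.Preds)
      if (Pred != &SUa)
        addArtificialEdge(*Pred, SUa);
  }
}

int main() {
  SUnit Load0{0}, Load1{1}, Use{2};
  addArtificialEdge(Load0, Use); // Use consumes Load0's result
  copyClusterEdges(Load0, Load1, /*IsLoad=*/true);
  // Use now also has Load1 as a predecessor, keeping the loads adjacent.
  return Use.Preds.size() == 2 ? 0 : 1;
}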