Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -135,6 +135,12 @@
 // Misc. PatFrags
 //===----------------------------------------------------------------------===//
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0),
+  (op $src0),
+  [{ return N->hasOneUse(); }]
+>;
+
 class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
   (ops node:$src0, node:$src1),
   (op $src0, $src1),
   [{ return N->hasOneUse(); }]
@@ -165,6 +171,8 @@
 def xor_oneuse : HasOneUseBinOp<xor>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
+def not_oneuse : HasOneUseUnaryOp<not>;
+
 def add_oneuse : HasOneUseBinOp<add>;
 def sub_oneuse : HasOneUseBinOp<sub>;
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -89,12 +89,20 @@
   void lowerScalarXnor(SetVectorType &Worklist,
                        MachineInstr &Inst) const;
 
+  void splitScalarNotBinop(SetVectorType &Worklist,
+                           MachineInstr &Inst,
+                           unsigned Opcode) const;
+
+  void splitScalarBinOpN2(SetVectorType &Worklist,
+                          MachineInstr &Inst,
+                          unsigned Opcode) const;
+
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst, unsigned Opcode) const;
 
   void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
                               MachineDominatorTree *MDT = nullptr) const;
-
+
   void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
                                 unsigned Opcode,
                                 MachineDominatorTree *MDT = nullptr) const;
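The two helpers declared above implement the 32-bit expansions used by the moveToVALU changes below: splitScalarNotBinop rewrites s_nand_b32/s_nor_b32 as the plain s_and_b32/s_or_b32 into a temporary followed by s_not_b32, while splitScalarBinOpN2 rewrites s_andn2_b32/s_orn2_b32 as s_not_b32 of the second source followed by the plain binary op. A minimal standalone C++ sketch of the bitwise algebra being relied on (editor's illustration only, not LLVM code; the s_* functions are just stand-ins for the opcode semantics):

#include <cassert>
#include <cstdint>

// Stand-ins for the composite scalar opcode semantics (illustration only).
static uint32_t s_nand(uint32_t S0, uint32_t S1)  { return ~(S0 & S1); }
static uint32_t s_nor(uint32_t S0, uint32_t S1)   { return ~(S0 | S1); }
static uint32_t s_andn2(uint32_t S0, uint32_t S1) { return S0 & ~S1; }
static uint32_t s_orn2(uint32_t S0, uint32_t S1)  { return S0 | ~S1; }

int main() {
  const uint32_t X = 0xDEADBEEF, Y = 0x0000FFFF;

  // splitScalarNotBinop order: the binary op first, then invert the result.
  uint32_t Interm = X & Y;
  assert(uint32_t(~Interm) == s_nand(X, Y));
  Interm = X | Y;
  assert(uint32_t(~Interm) == s_nor(X, Y));

  // splitScalarBinOpN2 order: invert the *second* source, then the binary op.
  // The inversion must stay on src1; andn2/orn2 are not commutative.
  uint32_t NotSrc1 = ~Y;
  assert((X & NotSrc1) == s_andn2(X, Y));
  assert((X | NotSrc1) == s_orn2(X, Y));
  assert(s_andn2(X, Y) != s_andn2(Y, X));
  return 0;
}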
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3121,6 +3121,8 @@
   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+  case AMDGPU::S_XNOR_B32:
+    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
@@ -4088,22 +4090,47 @@
       // Default handling
       break;
     case AMDGPU::S_AND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_OR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_XOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NAND_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XNOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ANDN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ORN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_NOT_B64:
-      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
       Inst.eraseFromParent();
       continue;
 
@@ -4184,8 +4211,23 @@
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_XNOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+    case AMDGPU::S_NAND_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOR_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ANDN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ORN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
       Inst.eraseFromParent();
       continue;
 
@@ -4471,23 +4513,116 @@
   MachineOperand &Src0 = Inst.getOperand(1);
   MachineOperand &Src1 = Inst.getOperand(2);
 
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
-
-  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   if (ST.hasDLInsts()) {
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
       .add(Src0)
       .add(Src1);
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   } else {
-    unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
-      .add(Src0)
+    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
+    // invert either source and then perform the XOR. If either source is a
+    // scalar register, then we can leave the inversion on the scalar unit to
+    // achieve a better distribution of scalar and vector instructions.
+    bool Src0IsSGPR = Src0.isReg() &&
+                      MRI.getRegClass(Src0.getReg()) == &AMDGPU::SGPR_32RegClass;
+    bool Src1IsSGPR = Src1.isReg() &&
+                      MRI.getRegClass(Src1.getReg()) == &AMDGPU::SGPR_32RegClass;
+    MachineInstr *Not = nullptr;
+    MachineInstr *Xor = nullptr;
+    unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+    // Build a pair of scalar instructions and add them to the work list.
+    // The next iteration over the work list will lower these to the vector
+    // unit as necessary.
+    if (Src0IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src0);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+        .addReg(Temp)
       .add(Src1);
+    } else if (Src1IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src1);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+        .add(Src0)
+        .addReg(Temp);
+    } else {
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
+        .add(Src0)
+        .add(Src1);
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+        .addReg(Temp);
+      Worklist.insert(Not);
+    }
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+    Worklist.insert(Xor);
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
-      .addReg(Xor);
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   }
+}
+
+void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+                                      MachineInstr &Inst,
+                                      unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
+    .add(Src0)
+    .add(Src1);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+    .addReg(Interm);
+
+  Worklist.insert(&Op);
+  Worklist.insert(&Not);
+
+  MRI.replaceRegWith(Dest.getReg(), NewDest);
+  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+                                     MachineInstr &Inst,
+                                     unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
+    .add(Src1);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
+    .add(Src0)
+    .addReg(Interm);
+
+  Worklist.insert(&Not);
+  Worklist.insert(&Op);
 
   MRI.replaceRegWith(Dest.getReg(), NewDest);
   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
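On subtargets without v_xnor_b32, the rewritten lowerScalarXnor above keeps the inversion on the SALU whenever one source is an SGPR, relying on the identity quoted in its comment: ~(x ^ y) == ~x ^ y == x ^ ~y, so the NOT can be attached to whichever operand is still scalar. A quick exhaustive check of that identity (editor's illustration only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Exhaustive over 8-bit operands; the identity is width-independent.
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t Xnor = ~(A ^ B);          // what s_xnor_b32 / v_xnor_b32 compute
      assert(uint8_t(~A ^ B) == Xnor);  // invert src0, then XOR
      assert(uint8_t(A ^ ~B) == Xnor);  // invert src1, then XOR
    }
  }
  return 0;
}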
@@ -4520,13 +4655,13 @@
   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
 
   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
+  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
 
   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                        AMDGPU::sub1, Src0SubRC);
 
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
 
   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -4537,6 +4672,9 @@
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
+
   // We don't need to legalizeOperands here because for a single operand, src0
   // will support any kind of input.
 
@@ -4642,6 +4780,10 @@
                                                        AMDGPU::sub0, Src0SubRC);
   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                        AMDGPU::sub0, Src1SubRC);
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub1, Src1SubRC);
 
   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
@@ -4652,11 +4794,6 @@
     .add(SrcReg0Sub0)
     .add(SrcReg1Sub0);
 
-  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
-                                                       AMDGPU::sub1, Src0SubRC);
-  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
-                                                       AMDGPU::sub1, Src1SubRC);
-
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
     .add(SrcReg0Sub1)
@@ -4671,10 +4808,8 @@
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
-  // Try to legalize the operands in case we need to swap the order to keep it
-  // valid.
-  legalizeOperands(LoHalf, MDT);
-  legalizeOperands(HiHalf, MDT);
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
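The splitScalar64BitUnaryOp/splitScalar64BitBinaryOp changes above now hand the per-half instructions (LoHalf/HiHalf, built with the 32-bit scalar opcode) back to the work list instead of legalizing them immediately, so a later work-list iteration decides for each half whether it still has to move to the VALU. The split itself is sound because these bitwise operations act on the two 32-bit halves independently; a standalone sketch of that property (illustration only, not LLVM code; regSequence just mimics how REG_SEQUENCE recombines sub0/sub1):

#include <cassert>
#include <cstdint>

// Recombine two 32-bit halves the way REG_SEQUENCE does (sub0 = low, sub1 = high).
static uint64_t regSequence(uint32_t Lo, uint32_t Hi) {
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  const uint64_t A = 0x0123456789ABCDEFull, B = 0xF0F0F0F00FF00FF0ull;
  const uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  const uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);

  // s_and_b64 behaves like s_and_b32 on each half, recombined.
  assert(regSequence(ALo & BLo, AHi & BHi) == (A & B));
  // Likewise for s_xnor_b64 ...
  assert(regSequence(~(ALo ^ BLo), ~(AHi ^ BHi)) == ~(A ^ B));
  // ... and for the unary s_not_b64 case.
  assert(regSequence(~ALo, ~AHi) == ~A);
  return 0;
}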
Index: lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/SOPInstructions.td
+++ lib/Target/AMDGPU/SOPInstructions.td
@@ -336,6 +336,12 @@
   "$sdst, $src0, $src1", pattern
 >;
 
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return !N->isDivergent(); }]
+>;
+
 class UniformBinFrag<SDPatternOperator Op> : PatFrag <
   (ops node:$src0, node:$src1),
   (Op $src0, $src1),
   [{ return !N->isDivergent(); }]
@@ -421,16 +427,39 @@
 def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
   [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
 >;
+
+def S_NAND_B32 : SOP2_32 <"s_nand_b32",
+  [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NAND_B64 : SOP2_64 <"s_nand_b64",
+  [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+>;
+
+def S_NOR_B32 : SOP2_32 <"s_nor_b32",
+  [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NOR_B64 : SOP2_64 <"s_nor_b64",
+  [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+>;
 } // End isCommutable = 1
 
-def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
-def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
-def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
-def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
-def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
-def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
-def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
-def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
+
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
 } // End Defs = [SCC]
 
 // Use added complexity so these patterns are preferred to the VALU patterns.
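Note that the nand/nor patterns above are gated both on uniformity (the node must not be divergent, so the result can live in an SGPR) and on the inner and/or having a single use, via the *_oneuse fragments from AMDGPUInstructions.td. If the intermediate value has another user it must be materialized anyway, so folding the NOT into a composite opcode would not shrink the instruction count; the scalar_*_mul_use functions in the new nand.ll test below pin down exactly that behaviour. A rough instruction-count sketch of the trade-off (editor's illustration, not compiler code):

#include <cassert>

// Scalar instructions needed to produce not(and(x, y)) when the inner AND has
// `AndUses` users in total (the NOT being one of them).
static int withNandFold(int AndUses) {
  // Single use: one s_nand_b32. Extra uses: the s_and_b32 must be kept as
  // well, so the fold no longer saves anything.
  return AndUses > 1 ? 2 : 1;
}
static int withoutFold(int /*AndUses*/) {
  return 2; // s_and_b32 + s_not_b32, regardless of use count
}

int main() {
  assert(withNandFold(1) < withoutFold(1));  // one use: s_nand wins
  assert(withNandFold(2) == withoutFold(2)); // multiple uses: no win, so the
                                             // pattern deliberately does not fire
  return 0;
}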
Index: test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
===================================================================
--- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -107,7 +107,7 @@
 ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
 ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
 ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
-; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]]
+; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
Index: test/CodeGen/AMDGPU/fceil64.ll
===================================================================
--- test/CodeGen/AMDGPU/fceil64.ll
+++ test/CodeGen/AMDGPU/fceil64.ll
@@ -17,8 +17,7 @@
 ; are not always followed.
 ; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01
 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]]
-; SI-DAG: s_not_b64
-; SI-DAG: s_and_b64
+; SI-DAG: s_andn2_b64
 ; SI-DAG: cmp_gt_i32
 ; SI-DAG: cndmask_b32
 ; SI-DAG: cndmask_b32
Index: test/CodeGen/AMDGPU/ftrunc.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -27,8 +27,7 @@
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
-; SI-DAG: s_not_b64
-; SI-DAG: s_and_b64
+; SI-DAG: s_andn2_b64
 ; SI-DAG: cmp_gt_i32
 ; SI-DAG: cndmask_b32
 ; SI-DAG: cndmask_b32
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -229,8 +229,7 @@
 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
-; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
+; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]], [[SHIFTED_MASK]]
 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
 ; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
 
@@ -269,8 +268,7 @@
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
 ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
-; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
-; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[NOT_MASK]], [[VEC]]
+; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
 ; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
 ; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
 ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
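The insert_vector_elt.ll updates show the s_not + s_and pair collapsing into a single s_andn2 inside the dynamic-index insert sequence: a shifted element mask clears the target lane of the loaded vector (vec & ~(mask << shift)) before the new value is merged back in (v_bfi_b32 in the real output). A plain C++ model of that arithmetic shape, using illustrative 16-bit lanes in a 32-bit register (the test's actual types, index scaling, and merge instruction differ; this is only the general pattern):

#include <cassert>
#include <cstdint>

// Insert a 16-bit element into a packed 32-bit value at a dynamic index.
static uint32_t insertElt16(uint32_t Vec, uint16_t Val, uint32_t Idx) {
  uint32_t Shift = Idx * 16;                  // scaled index (s_lshl_b32)
  uint32_t Mask  = uint32_t(0xffff) << Shift; // shifted element mask
  uint32_t Cleared = Vec & ~Mask;             // s_andn2 (previously s_not + s_and)
  return Cleared | (uint32_t(Val) << Shift);  // merge the new element back in
}

int main() {
  assert(insertElt16(0xAAAABBBB, 0x5, 0) == 0xAAAA0005);
  assert(insertElt16(0xAAAABBBB, 0x5, 1) == 0x0005BBBB);
  return 0;
}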
Index: test/CodeGen/AMDGPU/nand.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/nand.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s
+
+; GCN-LABEL: {{^}}scalar_nand_i32_one_use
+; GCN: s_nand_b32
+define amdgpu_kernel void @scalar_nand_i32_one_use(
+    i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, %b
+  %r0.val = xor i32 %and, -1
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i32_mul_use
+; GCN-NOT: s_nand_b32
+; GCN: s_and_b32
+; GCN: s_not_b32
+; GCN: s_add_i32
+define amdgpu_kernel void @scalar_nand_i32_mul_use(
+    i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, %b
+  %r0.val = xor i32 %and, -1
+  %r1.val = add i32 %and, %a
+  store i32 %r0.val, i32 addrspace(1)* %r0
+  store i32 %r1.val, i32 addrspace(1)* %r1
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i64_one_use
+; GCN: s_nand_b64
+define amdgpu_kernel void @scalar_nand_i64_one_use(
+    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+  %and = and i64 %a, %b
+  %r0.val = xor i64 %and, -1
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i64_mul_use
+; GCN-NOT: s_nand_b64
+; GCN: s_and_b64
+; GCN: s_not_b64
+; GCN: s_add_u32
+; GCN: s_addc_u32
+define amdgpu_kernel void @scalar_nand_i64_mul_use(
+    i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
+entry:
+  %and = and i64 %a, %b
+  %r0.val = xor i64 %and, -1
+  %r1.val = add i64 %and, %a
+  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r1.val, i64 addrspace(1)* %r1
+  ret void
+}
+
+; GCN-LABEL: {{^}}vector_nand_i32_one_use
+; GCN-NOT: s_nand_b32
+; GCN: v_and_b32
+; GCN: v_not_b32
+define i32 @vector_nand_i32_one_use(i32 %a, i32 %b) {
+entry:
+  %and = and i32 %a, %b
+  %r = xor i32 %and, -1
+  ret i32 %r
+}
+
+; GCN-LABEL: {{^}}vector_nand_i64_one_use
+; GCN-NOT: s_nand_b64
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_not_b32
+; GCN: v_not_b32
+define i64 @vector_nand_i64_one_use(i64 %a, i64 %b) {
+entry:
+  %and = and i64 %a, %b
+  %r = xor i64 %and, -1
+  ret i64 %r
+}
Index: test/CodeGen/AMDGPU/xnor.ll
===================================================================
--- test/CodeGen/AMDGPU/xnor.ll
+++ test/CodeGen/AMDGPU/xnor.ll
@@ -73,10 +73,10 @@
 
 ; GCN-LABEL: {{^}}vector_xnor_i64_one_use
 ; GCN-NOT: s_xnor_b64
-; GCN: v_xor_b32
-; GCN: v_xor_b32
 ; GCN: v_not_b32
+; GCN: v_xor_b32
 ; GCN: v_not_b32
+; GCN: v_xor_b32
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {