Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -62,7 +62,10 @@
   bool shouldCombineMemoryType(EVT VT) const;
   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
+                                       unsigned Opc, SDValue LHS,
+                                       uint32_t ValLo, uint32_t ValHi) const;
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -462,7 +462,6 @@
   MaxStoresPerMemset = 4096;

   setTargetDAGCombine(ISD::BITCAST);
-  setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
@@ -2093,38 +2092,21 @@
                       SN->getBasePtr(), SN->getMemOperand());
 }

-// TODO: Should repeat for other bit ops.
-SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
-                                                DAGCombinerInfo &DCI) const {
-  if (N->getValueType(0) != MVT::i64)
-    return SDValue();
-
-  // Break up 64-bit and of a constant into two 32-bit ands. This will typically
-  // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer
-  // combine opportunities since most 64-bit operations are decomposed this way.
-  // TODO: We won't want this for SALU especially if it is an inline immediate.
-  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS)
-    return SDValue();
-
-  uint64_t Val = RHS->getZExtValue();
-  if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
-    // If either half of the constant is 0, this is really a 32-bit and, so
-    // split it. If we can re-use the full materialized constant, keep it.
-    return SDValue();
-  }
-
-  SDLoc SL(N);
+/// Split the 64-bit value \p LHS into two 32-bit components, and apply the
+/// binary operation \p Opc to each half with the corresponding constant operand.
+SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
+  DAGCombinerInfo &DCI, const SDLoc &SL,
+  unsigned Opc, SDValue LHS,
+  uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
-
   SDValue Lo, Hi;
-  std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
+  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);

-  SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
-  SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
+  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);

-  SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
-  SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
+  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
+  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);

   // Re-visit the ands. It's possible we eliminated one of them and it could
   // simplify the vector.
@@ -2518,12 +2500,6 @@

     return performSraCombine(N, DCI);
   }
-  case ISD::AND: {
-    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
-      break;
-
-    return performAndCombine(N, DCI);
-  }
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:
Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -304,6 +304,126 @@
   return;
 }

+static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
+                                  int32_t LHS, int32_t RHS) {
+  switch (Opcode) {
+  case AMDGPU::V_AND_B32_e64:
+  case AMDGPU::S_AND_B32:
+    Result = LHS & RHS;
+    return true;
+  case AMDGPU::V_OR_B32_e64:
+  case AMDGPU::S_OR_B32:
+    Result = LHS | RHS;
+    return true;
+  case AMDGPU::V_XOR_B32_e64:
+  case AMDGPU::S_XOR_B32:
+    Result = LHS ^ RHS;
+    return true;
+  default:
+    return false;
+  }
+}
+
+static unsigned getMovOpc(bool IsScalar) {
+  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+}
+
+// Try to simplify operations with a constant that may appear after instruction
+// selection.
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
+                              const SIInstrInfo *TII,
+                              MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
+      Opc == AMDGPU::S_NOT_B32) {
+    MachineOperand &Src0 = MI->getOperand(1);
+    if (Src0.isImm()) {
+      Src0.setImm(~Src0.getImm());
+      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+      return true;
+    }
+
+    return false;
+  }
+
+  if (!MI->isCommutable())
+    return false;
+
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+
+  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
+  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
+  if (!Src0->isImm() && !Src1->isImm())
+    return false;
+
+  // and k0, k1 -> v_mov_b32 (k0 & k1)
+  // or k0, k1 -> v_mov_b32 (k0 | k1)
+  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
+  if (Src0->isImm() && Src1->isImm()) {
+    int32_t NewImm;
+    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
+      return false;
+
+    const SIRegisterInfo &TRI = TII->getRegisterInfo();
+    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
+
+    Src0->setImm(NewImm);
+    MI->RemoveOperand(Src1Idx);
+    MI->setDesc(TII->get(getMovOpc(IsSGPR)));
+    return true;
+  }
+
+  if (Src0->isImm() && !Src1->isImm()) {
+    std::swap(Src0, Src1);
+    std::swap(Src0Idx, Src1Idx);
+  }
+
+  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
+  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
+    if (Src1Val == 0) {
+      // y = or x, 0 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      MI->setDesc(TII->get(AMDGPU::COPY));
+    } else if (Src1Val == -1) {
+      // y = or x, -1 => y = v_mov_b32 -1
+      MI->RemoveOperand(Src1Idx);
+      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
+    } else
+      return false;
+
+    return true;
+  }
+
+  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
+      MI->getOpcode() == AMDGPU::S_AND_B32) {
+    if (Src1Val == 0) {
+      // y = and x, 0 => y = v_mov_b32 0
+      MI->RemoveOperand(Src0Idx);
+      MI->setDesc(TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
+    } else if (Src1Val == -1) {
+      // y = and x, -1 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      MI->setDesc(TII->get(AMDGPU::COPY));
+    } else
+      return false;
+
+    return true;
+  }
+
+  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
+      MI->getOpcode() == AMDGPU::S_XOR_B32) {
+    if (Src1Val == 0) {
+      // y = xor x, 0 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      MI->setDesc(TII->get(AMDGPU::COPY));
+    }
+  }
+
+  return false;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
@@ -389,6 +509,12 @@
       }
       DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
             Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
+
+      // Folding the immediate may reveal operations that can be constant
+      // folded or replaced with a copy. This can happen for example after
+      // frame indices are lowered to constants or from splitting 64-bit
+      // constants.
+      tryConstantFoldOp(MRI, TII, Fold.UseMI);
     }
   }
 }
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -58,8 +58,14 @@

   SDValue performSHLPtrCombine(SDNode *N,
                                unsigned AS,
                                DAGCombinerInfo &DCI) const;
+
+  SDValue splitBinaryBitConstantOp(DAGCombinerInfo &DCI, const SDLoc &SL,
+                                   unsigned Opc, SDValue LHS,
+                                   const ConstantSDNode *CRHS) const;
+
   SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -227,6 +227,7 @@
   setTargetDAGCombine(ISD::SETCC);
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::FCANONICALIZE);

@@ -2894,23 +2895,62 @@
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
 }

+static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
+  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
+         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
+         (Opc == ISD::XOR && Val == 0);
+}
+
+// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
+// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
+// integer combine opportunities since most 64-bit operations are decomposed
+// this way. TODO: We won't want this for SALU especially if it is an inline
+// immediate.
+SDValue SITargetLowering::splitBinaryBitConstantOp(
+  DAGCombinerInfo &DCI,
+  const SDLoc &SL,
+  unsigned Opc, SDValue LHS,
+  const ConstantSDNode *CRHS) const {
+  uint64_t Val = CRHS->getZExtValue();
+  uint32_t ValLo = Lo_32(Val);
+  uint32_t ValHi = Hi_32(Val);
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
+       bitOpWithConstantIsReducible(Opc, ValHi)) ||
+      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
+    // If we need to materialize a 64-bit immediate, it will be split up later
+    // anyway. Avoid creating the harder to understand 64-bit immediate
+    // materialization.
+    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::performAndCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   if (DCI.isBeforeLegalize())
     return SDValue();

-  if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
-    return Base;
-
   SelectionDAG &DAG = DCI.DAG;
-
-  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
-  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
-  if (LHS.getOpcode() == ISD::SETCC &&
-      RHS.getOpcode() == ISD::SETCC) {
+
+  if (VT == MVT::i64) {
+    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+    if (CRHS) {
+      if (SDValue Split
+          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+        return Split;
+    }
+  }
+
+  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
@@ -2958,54 +2998,85 @@
   SDValue RHS = N->getOperand(1);
   EVT VT = N->getValueType(0);

-  if (VT == MVT::i64) {
-    // TODO: This could be a generic combine with a predicate for extracting the
-    // high half of an integer being free.
-
-    // (or i64:x, (zero_extend i32:y)) ->
-    //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
-    if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
-        RHS.getOpcode() != ISD::ZERO_EXTEND)
-      std::swap(LHS, RHS);
-
-    if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
-      SDValue ExtSrc = RHS.getOperand(0);
-      EVT SrcVT = ExtSrc.getValueType();
-      if (SrcVT == MVT::i32) {
-        SDLoc SL(N);
-        SDValue LowLHS, HiBits;
-        std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
-        SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
-
-        DCI.AddToWorklist(LowOr.getNode());
-        DCI.AddToWorklist(HiBits.getNode());
-
-        SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
-                                  LowOr, HiBits);
-        return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
-      }
+  if (VT == MVT::i1) {
+    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+      SDValue Src = LHS.getOperand(0);
+      if (Src != RHS.getOperand(0))
+        return SDValue();
+
+      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+      if (!CLHS || !CRHS)
+        return SDValue();
+
+      // Only 10 bits are used.
+      static const uint32_t MaxMask = 0x3ff;
+
+      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+      SDLoc DL(N);
+      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
     }
+
+    return SDValue();
   }

-  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
-  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
-      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
-    SDValue Src = LHS.getOperand(0);
-    if (Src != RHS.getOperand(0))
-      return SDValue();
+  if (VT != MVT::i64)
+    return SDValue();

-    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
-    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
-    if (!CLHS || !CRHS)
-      return SDValue();
+  // TODO: This could be a generic combine with a predicate for extracting the
+  // high half of an integer being free.
+
+  // (or i64:x, (zero_extend i32:y)) ->
+  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+      RHS.getOpcode() != ISD::ZERO_EXTEND)
+    std::swap(LHS, RHS);
+
+  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+    SDValue ExtSrc = RHS.getOperand(0);
+    EVT SrcVT = ExtSrc.getValueType();
+    if (SrcVT == MVT::i32) {
+      SDLoc SL(N);
+      SDValue LowLHS, HiBits;
+      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+      DCI.AddToWorklist(LowOr.getNode());
+      DCI.AddToWorklist(HiBits.getNode());
+
+      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                LowOr, HiBits);
+      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+    }
+  }

-    // Only 10 bits are used.
-    static const uint32_t MaxMask = 0x3ff;
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (CRHS) {
+    if (SDValue Split
+          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
+      return Split;
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performXorCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);

-    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
-    SDLoc DL(N);
-    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
-                       Src, DAG.getConstant(NewMask, DL, MVT::i32));
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (CRHS) {
+    if (SDValue Split
+          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
+      return Split;
   }

   return SDValue();
@@ -3422,6 +3493,8 @@
     return performAndCombine(N, DCI);
   case ISD::OR:
     return performOrCombine(N, DCI);
+  case ISD::XOR:
+    return performXorCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
   case ISD::FCANONICALIZE:
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1886,7 +1886,7 @@

 def : Pat <
   (fabs f32:$src),
-  (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
+  (V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff))
 >;

 def : Pat <
@@ -1899,7 +1899,7 @@
   (REG_SEQUENCE VReg_64,
     (i32 (EXTRACT_SUBREG f64:$src, sub0)),
     sub0,
-    (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+    (V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1),
                    (V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
sub1) >; Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -324,6 +324,20 @@ ret void } +; FIXME: Should be able to reduce load width +; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64: +; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI-NOT: and +; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] +; SI-NOT: and +; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} +define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, -8 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + ; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64 ; SI: s_load_dword ; SI-NOT: and Index: test/CodeGen/AMDGPU/bitreverse.ll =================================================================== --- test/CodeGen/AMDGPU/bitreverse.ll +++ test/CodeGen/AMDGPU/bitreverse.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare i16 @llvm.bitreverse.i16(i16) #1 @@ -79,6 +79,7 @@ } ; FUNC-LABEL: {{^}}v_brev_i64: +; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { %val = load i64, i64 addrspace(1)* %valptr %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 Index: test/CodeGen/AMDGPU/bswap.ll =================================================================== --- test/CodeGen/AMDGPU/bswap.ll +++ test/CodeGen/AMDGPU/bswap.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare i32 @llvm.bswap.i32(i32) nounwind readnone @@ -93,6 +93,8 @@ ret void } +; FUNC-LABEL: {{^}}test_bswap_i64: +; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone Index: test/CodeGen/AMDGPU/constant-fold-mi-operands.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -0,0 +1,144 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}fold_mi_v_and_0: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @fold_mi_v_and_0(i32 addrspace(1)* %out) { + %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %size = call i32 @llvm.amdgcn.groupstaticsize() + %and = and i32 %size, %x + store i32 %and, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_s_and_0: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 
%x) #0 { + %size = call i32 @llvm.amdgcn.groupstaticsize() + %and = and i32 %size, %x + store i32 %and, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_v_or_0: +; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @fold_mi_v_or_0(i32 addrspace(1)* %out) { + %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %size = call i32 @llvm.amdgcn.groupstaticsize() + %or = or i32 %size, %x + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_s_or_0: +; GCN: s_load_dword [[SVAL:s[0-9]+]] +; GCN-NOT: [[SVAL]] +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] +; GCN-NOT: [[VVAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { + %size = call i32 @llvm.amdgcn.groupstaticsize() + %or = or i32 %size, %x + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_v_xor_0: +; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { + %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %size = call i32 @llvm.amdgcn.groupstaticsize() + %xor = xor i32 %size, %x + store i32 %xor, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_s_xor_0: +; GCN: s_load_dword [[SVAL:s[0-9]+]] +; GCN-NOT: [[SVAL]] +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] +; GCN-NOT: [[VVAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { + %size = call i32 @llvm.amdgcn.groupstaticsize() + %xor = xor i32 %size, %x + store i32 %xor, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_s_not_0: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}} +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { + %size = call i32 @llvm.amdgcn.groupstaticsize() + %xor = xor i32 %size, -1 + store i32 %xor, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_v_not_0: +; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] +; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} +; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define void @fold_mi_v_not_0(i64 addrspace(1)* %out) { + %vreg = load volatile i64, i64 addrspace(1)* undef + %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg) + %xor = xor i64 %ctpop, -1 + store i64 %xor, i64 addrspace(1)* %out + ret void +} + +; The neg1 appears after folding the not 0 +; GCN-LABEL: {{^}}fold_mi_or_neg1: +; GCN: buffer_load_dwordx2 +; GCN: buffer_load_dwordx2 v{{\[}}[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]{{\]}} + +; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]] +; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] +; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define void @fold_mi_or_neg1(i64 addrspace(1)* %out) { + %vreg0 = load volatile i64, i64 addrspace(1)* undef + %vreg1 = load volatile i64, i64 addrspace(1)* undef + %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) + %xor = xor i64 %ctpop, -1 + %or = or i64 
%xor, %vreg1 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fold_mi_and_neg1: +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_not_b32 +; GCN: v_and_b32 +; GCN-NOT: v_and_b32 +define void @fold_mi_and_neg1(i64 addrspace(1)* %out) { + %vreg0 = load volatile i64, i64 addrspace(1)* undef + %vreg1 = load volatile i64, i64 addrspace(1)* undef + %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) + %xor = xor i64 %ctpop, -1 + %and = and i64 %xor, %vreg1 + store i64 %and, i64 addrspace(1)* %out + ret void +} + +declare i64 @llvm.ctpop.i64(i64) #1 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare i32 @llvm.amdgcn.groupstaticsize() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop64.ll +++ test/CodeGen/AMDGPU/ctpop64.ll @@ -39,14 +39,13 @@ ret void } -; FIXME: or 0 should be replaxed with copy ; FUNC-LABEL: {{^}}v_ctpop_i64_user: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, ; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] -; GCN-DAG: v_or_b32_e64 v[[RESULT_HI:[0-9]+]], 0, s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { Index: test/CodeGen/AMDGPU/or.ll =================================================================== --- test/CodeGen/AMDGPU/or.ll +++ test/CodeGen/AMDGPU/or.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -62,6 +62,75 @@ ret void } +; FUNC-LABEL: {{^}}scalar_or_literal_i64: +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b +; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] +define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { + %or = or i64 %a, 4261135838621753 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64: +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b +; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 +; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} + +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] +; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %or = or i64 %a, 4261135838621753 + store i64 %or, i64 addrspace(1)* %out + + %foo = add i64 %b, 4261135838621753 + store volatile 
i64 %foo, i64 addrspace(1)* undef + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64: +; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI-NOT: or_b32 +; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63 +; SI-NOT: or_b32 +; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]] +; SI-NOT: or_b32 +; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] +; SI-NOT: or_b32 +; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { + %or = or i64 %a, 63 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_inline_imm_multi_use_i64: +; SI-NOT: or_b32 +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63 +; SI-NOT: or_b32 +define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %or = or i64 %a, 63 + store i64 %or, i64 addrspace(1)* %out + %foo = add i64 %b, 63 + store volatile i64 %foo, i64 addrspace(1)* undef + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_neg_inline_imm_i64: +; SI-DAG: s_load_dword [[VAL:s[0-9]+]] +; SI-DAG: s_or_b32 [[VAL]], [[VAL]], -8 +; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} +; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] +; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { + %or = or i64 %a, -8 + store i64 %or, i64 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}vector_or_literal_i32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { @@ -127,8 +196,9 @@ ; FIXME: The or 0 should really be removed. ; FUNC-LABEL: {{^}}vector_or_i64_imm: ; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]] -; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}} +; SI: v_or_b32_e32 v[[LO_RESULT:[0-9]+]], 8, v[[LO_VREG]] +; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0 +; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}} ; SI: s_endpgm define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 @@ -137,6 +207,32 @@ ret void } +; FUNC-LABEL: {{^}}vector_or_i64_neg_inline_imm: +; SI-DAG: buffer_load_dword v[[LO_VREG:[0-9]+]] +; SI-DAG: v_or_b32_e32 v[[RES_LO:[0-9]+]], -8, v[[LO_VREG]] +; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}} +; SI: s_endpgm +define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, -8 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i64_neg_literal: +; SI-DAG: buffer_load_dword v[[LO_VREG:[0-9]+]] +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, -1{{$}} +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]] +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, -200 + store i64 %or, i64 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}trunc_i64_or_to_i32: ; SI: s_load_dword s[[SREG0:[0-9]+]] ; SI: s_load_dword s[[SREG1:[0-9]+]] Index: test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll =================================================================== 
--- test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -97,7 +97,6 @@ ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]] -; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}} ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/sint_to_fp.i64.ll =================================================================== --- test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -38,6 +38,7 @@ } ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64: +; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1, define void @s_sint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ %result = sitofp <2 x i64> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out Index: test/CodeGen/AMDGPU/xor.ll =================================================================== --- test/CodeGen/AMDGPU/xor.ll +++ test/CodeGen/AMDGPU/xor.ll @@ -171,3 +171,81 @@ store i64 %3, i64 addrspace(1)* %out ret void } + +; FUNC-LABEL: {{^}}scalar_xor_literal_i64: +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b +; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] +define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { + %or = xor i64 %a, 4261135838621753 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64: +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b +; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 +; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} + +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] +; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %or = xor i64 %a, 4261135838621753 + store i64 %or, i64 addrspace(1)* %out + + %foo = add i64 %b, 4261135838621753 + store volatile i64 %foo, i64 addrspace(1)* undef + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64: +; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI-NOT: xor_b32 +; SI: s_xor_b32 s[[VAL_LO]], s[[VAL_LO]], 63 +; SI-NOT: xor_b32 +; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]] +; SI-NOT: xor_b32 +; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] +; SI-NOT: xor_b32 +; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { + %or = xor i64 %a, 63 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_xor_b64 [[VAL]], [[VAL]], -8 +define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { + %or = xor i64 %a, -8 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_i64_neg_inline_imm: +; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI: v_xor_b32_e32 {{v[0-9]+}}, 
-8, v[[LO_VREG]] +; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}} +; SI: s_endpgm +define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = xor i64 %loada, -8 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_literal_i64: +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] +; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] +; SI: s_endpgm +define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = xor i64 %loada, 22470723082367 + store i64 %or, i64 addrspace(1)* %out + ret void +}
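
For reference, the arithmetic behind the new combine can be restated on plain integers. The sketch below is illustrative only and is not part of the patch; BitOp, halfIsReducible, and splitApply are hypothetical stand-ins for the ISD opcodes, bitOpWithConstantIsReducible, and splitBinaryBitConstantOpImpl used above, assuming only that the 64-bit bitwise operation distributes over the two 32-bit halves.

// Standalone illustration (not part of the patch) of the 64-bit bit-op
// splitting logic, on plain integers instead of SDValues.
#include <cstdint>
#include <cstdio>

enum class BitOp { And, Or, Xor };

// Mirrors bitOpWithConstantIsReducible: a 32-bit half is trivial when the
// constant reduces that half to a copy, all zeros, or all ones.
static bool halfIsReducible(BitOp Op, uint32_t Val) {
  switch (Op) {
  case BitOp::And:
  case BitOp::Or:
    return Val == 0 || Val == 0xffffffffu;
  case BitOp::Xor:
    return Val == 0;
  }
  return false;
}

static uint32_t apply(BitOp Op, uint32_t A, uint32_t B) {
  switch (Op) {
  case BitOp::And: return A & B;
  case BitOp::Or:  return A | B;
  case BitOp::Xor: return A ^ B;
  }
  return 0;
}

// Mirrors splitBinaryBitConstantOpImpl: perform the operation on each 32-bit
// half with the matching half of the constant, then reassemble the result.
static uint64_t splitApply(BitOp Op, uint64_t X, uint64_t K) {
  uint32_t Lo = apply(Op, uint32_t(X), uint32_t(K));
  uint32_t Hi = apply(Op, uint32_t(X >> 32), uint32_t(K >> 32));
  return uint64_t(Lo) | (uint64_t(Hi) << 32);
}

int main() {
  // and x, -8: the low half of the constant (0xfffffff8) still does real
  // work, but the high half (0xffffffff) reduces to a copy, so splitting
  // is profitable.
  uint64_t X = 0x123456789abcdef0ull;
  uint64_t K = uint64_t(-8);
  bool Profitable = halfIsReducible(BitOp::And, uint32_t(K)) ||
                    halfIsReducible(BitOp::And, uint32_t(K >> 32));
  printf("profitable: %d, split matches: %d\n", Profitable,
         splitApply(BitOp::And, X, K) == (X & K));
  return 0;
}

With and x, -8 the high half of the constant is all ones, so after the DAG-level split SIFoldOperands can turn the high 32-bit and into a plain copy, which is what the v_and_inline_neg_imm_i64 and fold_mi_* tests above check.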