Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -402,6 +402,7 @@
   MAD_I64_I32,
   MUL_LOHI_I24,
   MUL_LOHI_U24,
+  PERM,
   TEXTURE_FETCH,
   EXPORT, // exp on SI+
   EXPORT_DONE, // exp on SI+ with done bit set
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4119,6 +4119,7 @@
   NODE_NAME_CASE(MAD_I24)
   NODE_NAME_CASE(MAD_I64_I32)
   NODE_NAME_CASE(MAD_U64_U32)
+  NODE_NAME_CASE(PERM)
   NODE_NAME_CASE(TEXTURE_FETCH)
   NODE_NAME_CASE(EXPORT)
   NODE_NAME_CASE(EXPORT_DONE)
@@ -4374,6 +4375,34 @@
     Known.Zero.setHighBits(32 - MaxValBits);
     break;
   }
+  case AMDGPUISD::PERM: {
+    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!CMask)
+      return;
+
+    KnownBits LHSKnown, RHSKnown;
+    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+    unsigned Sel = CMask->getZExtValue();
+
+    for (unsigned I = 0; I < 32; I += 8) {
+      unsigned ByteMask = 0xff << I;
+      unsigned SelBits = Sel & 0xff;
+      if (SelBits < 4) {
+        Known.One |= RHSKnown.One & ByteMask;
+        Known.Zero |= RHSKnown.Zero & ByteMask;
+      } else if (SelBits < 7) {
+        Known.One |= LHSKnown.One & ByteMask;
+        Known.Zero |= LHSKnown.Zero & ByteMask;
+      } else if (SelBits == 0x0c) {
+        Known.Zero |= ByteMask;
+      } else if (SelBits > 0x0c) {
+        Known.One |= ByteMask;
+      }
+      Sel >>= 8;
+    }
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -339,6 +339,8 @@
 
 def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
 
+def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+
 def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
                              SDTypeProfile<0, 1, [SDTCisInt<0>]>,
                              [SDNPHasChain, SDNPInGlue]>;
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6135,6 +6135,71 @@
   return false;
 }
 
+// If a constant has all zeroes or all ones within each byte, return it.
+// Otherwise return 0.
+static uint32_t getConstantPermuteMask(uint32_t C) {
+  // 0xff for any zero byte in the mask
+  uint32_t ZeroByteMask = 0;
+  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
+  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
+  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
+  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
+  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
+  if ((NonZeroByteMask & C) != NonZeroByteMask)
+    return 0; // Partial bytes selected.
+  return C;
+}
+
+// Check if a node selects whole bytes from its operand 0 starting at a byte
+// boundary while masking the rest. Returns the select mask as used by
+// v_perm_b32, or ~0 if it does not match.
+// Note the byte select encoding:
+// value 0-3 selects the corresponding source byte;
+// value 0xc selects zero;
+// value 0xff selects 0xff.
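+// In the final v_perm_b32 selectors produced by the combines below, values
+// 4-7 select the corresponding byte of the first source (the combines add 4
+// to every lane taken from the LHS), so 0x07060504 copies src0 unchanged,
+// 0x03020100 copies src1, and 0x0c0c0c0c produces zero. For example,
+// (x << 8) | (y >> 24) becomes v_perm_b32 x, y, 0x06050403 (see the
+// lsh8_or_lsr24 test below).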
+static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+  assert(V.getValueSizeInBits() == 32);
+
+  if (V.getNumOperands() != 2)
+    return ~0;
+
+  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
+  if (!N1)
+    return ~0;
+
+  uint32_t C = N1->getZExtValue();
+
+  switch (V.getOpcode()) {
+  default:
+    break;
+  case ISD::AND:
+    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
+    }
+    break;
+
+  case ISD::OR:
+    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+      return (0x03020100 & ~ConstMask) | ConstMask;
+    }
+    break;
+
+  case ISD::SHL:
+    if (C % 8)
+      return ~0;
+
+    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
+
+  case ISD::SRL:
+    if (C % 8)
+      return ~0;
+
+    return uint32_t(0x0c0c0c0c03020100ull >> C);
+  }
+
+  return ~0;
+}
+
 SDValue SITargetLowering::performAndCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   if (DCI.isBeforeLegalize())
@@ -6181,6 +6246,20 @@
         }
       }
     }
+
+    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
+        isa<ConstantSDNode>(LHS.getOperand(2))) {
+      uint32_t Sel = getConstantPermuteMask(Mask);
+      if (!Sel)
+        return SDValue();
+
+      // Select 0xc for all zero bytes
+      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
+      SDLoc DL(N);
+      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+    }
   }
 
   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -6233,6 +6312,54 @@
                        LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
   }
 
+  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+    uint32_t LHSMask = getPermuteMask(DAG, LHS);
+    uint32_t RHSMask = getPermuteMask(DAG, RHS);
+    if (LHSMask != ~0u && RHSMask != ~0u) {
+      // Canonicalize the expression in an attempt to have fewer unique masks
+      // and therefore fewer registers used to hold the masks.
+      if (LHSMask > RHSMask) {
+        std::swap(LHSMask, RHSMask);
+        std::swap(LHS, RHS);
+      }
+
+      // Select 0xc for each lane used from the source operand: a zero byte
+      // has 0xc in the mask, a 0xff byte has 0xff, and actually selected
+      // lanes are in the 0-3 range.
+      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+      if (!(LHSUsedLanes & RHSUsedLanes) &&
+          // If one source provides the high word and the other the low word,
+          // keep it for SDWA.
+          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+        // Each byte of each mask is either a lane selector in the 0-3 range
+        // or has higher bits set: 0xff for a constant 0xff byte, 0x0c for a
+        // zero byte. If either mask has 0x0c in a byte, the result must be
+        // 0x0c there; otherwise the mask that is not 0xff wins. ANDing both
+        // masks gives the correct result except that bytes that should be
+        // 0x0c must be forced back to exactly 0x0c.
+        uint32_t Mask = LHSMask & RHSMask;
+        for (unsigned I = 0; I < 32; I += 8) {
+          uint32_t ByteSel = 0xff << I;
+          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
+            Mask &= (0x0c << I) & 0xffffffff;
+        }
+
+        // Add 4 to each active LHS lane. It will not affect any existing
+        // 0xff or 0x0c.
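+        // For example, for ((x | 0x00ffff00) & (y | 0xff0000ff)) the masks
+        // are LHSMask = 0x03ffff00 and RHSMask = 0xff0201ff; ANDing them
+        // gives 0x03020100, and adding 4 to the active LHS lanes (bytes 3
+        // and 0) produces the final selector 0x07020104, as checked by the
+        // or_and_or test.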
+        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
+        SDLoc DL(N);
+
+        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+                           LHS.getOperand(0), RHS.getOperand(0),
+                           DAG.getConstant(Sel, DL, MVT::i32));
+      }
+    }
+  }
+
   return SDValue();
 }
 
@@ -6268,6 +6395,60 @@
     return SDValue();
   }
 
+  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
+      LHS.getOpcode() == AMDGPUISD::PERM &&
+      isa<ConstantSDNode>(LHS.getOperand(2))) {
+    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
+    if (!Sel)
+      return SDValue();
+
+    Sel |= LHS.getConstantOperandVal(2);
+    SDLoc DL(N);
+    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+  }
+
+  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+    uint32_t LHSMask = getPermuteMask(DAG, LHS);
+    uint32_t RHSMask = getPermuteMask(DAG, RHS);
+    if (LHSMask != ~0u && RHSMask != ~0u) {
+      // Canonicalize the expression in an attempt to have fewer unique masks
+      // and therefore fewer registers used to hold the masks.
+      if (LHSMask > RHSMask) {
+        std::swap(LHSMask, RHSMask);
+        std::swap(LHS, RHS);
+      }
+
+      // Select 0xc for each lane used from the source operand: a zero byte
+      // has 0xc in the mask, a 0xff byte has 0xff, and actually selected
+      // lanes are in the 0-3 range.
+      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+      if (!(LHSUsedLanes & RHSUsedLanes) &&
+          // If one source provides the high word and the other the low word,
+          // keep it for SDWA.
+          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+        // Kill zero bytes selected by the other mask; the selector value for
+        // a zero byte is 0xc.
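+        // For example, for ((x << 8) | (y & 0xff)): LHSMask = 0x0201000c and
+        // RHSMask = 0x0c0c0c00; clearing the bytes owned by the other source
+        // gives 0x02010000 and 0x00000000, and adding 4 to the active LHS
+        // lanes yields the selector 0x06050400, as checked by the
+        // lsh8_or_and test.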
+        LHSMask &= ~RHSUsedLanes;
+        RHSMask &= ~LHSUsedLanes;
+        // Add 4 to each active LHS lane
+        LHSMask |= LHSUsedLanes & 0x04040404;
+        // Combine masks
+        uint32_t Sel = LHSMask | RHSMask;
+        SDLoc DL(N);
+
+        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+                           LHS.getOperand(0), RHS.getOperand(0),
+                           DAG.getConstant(Sel, DL, MVT::i32));
+      }
+    }
+  }
+
   if (VT != MVT::i64)
     return SDValue();
Index: llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
+++ llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
@@ -449,7 +449,7 @@
 def V_INTERP_P2_F32_e64  : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
 def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
 
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
 
 } // End SubtargetPredicate = isVI
 
 let Predicates = [Has16BitInsts] in {
Index: llvm/trunk/test/CodeGen/AMDGPU/permute.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/permute.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/permute.ll
@@ -0,0 +1,199 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}lsh8_or_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050400
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @lsh8_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+  %tmp = load i32, i32 addrspace(1)* %gep, align 4
+  %tmp2 = shl i32 %tmp, 8
+  %tmp3 = and i32 %arg1, 255
+  %tmp4 = or i32 %tmp2, %tmp3
+  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lsr24_or_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @lsr24_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+  %tmp = load i32, i32 addrspace(1)* %gep, align 4
+  %tmp2 = lshr i32 %tmp, 24
+  %tmp3 = and i32 %arg1, 4294967040 ; 0xffffff00
+  %tmp4 = or i32 %tmp2, %tmp3
+  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_or_lsr24:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @and_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+  %tmp = load i32, i32 addrspace(1)* %gep, align 4
+  %tmp2 = and i32 %tmp, 4294967040 ; 0xffffff00
+  %tmp3 = lshr i32 %arg1, 24
+  %tmp4 = or i32 %tmp2, %tmp3
+  %tmp5 = xor i32 %tmp4, -2147483648
+  store i32 %tmp5, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_or_and:
+; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020500
+; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+define amdgpu_kernel void @and_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
+  %tmp = load i32, i32 addrspace(1)* %gep, align 4
+  %tmp2 = and i32 %tmp, -16711936
+  %tmp3 = and i32 
%arg1, 16711935 + %tmp4 = or i32 %tmp2, %tmp3 + store i32 %tmp4, i32 addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}lsh8_or_lsr24: +; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050403 +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +define amdgpu_kernel void @lsh8_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %tmp = load i32, i32 addrspace(1)* %gep, align 4 + %tmp2 = shl i32 %tmp, 8 + %tmp3 = lshr i32 %arg1, 24 + %tmp4 = or i32 %tmp2, %tmp3 + store i32 %tmp4, i32 addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}lsh16_or_lsr24: +; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x5040c03 +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +define amdgpu_kernel void @lsh16_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %tmp = load i32, i32 addrspace(1)* %gep, align 4 + %tmp2 = shl i32 %tmp, 16 + %tmp3 = lshr i32 %arg1, 24 + %tmp4 = or i32 %tmp2, %tmp3 + store i32 %tmp4, i32 addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}and_xor_and: +; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104 +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +define amdgpu_kernel void @and_xor_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %tmp = load i32, i32 addrspace(1)* %gep, align 4 + %tmp2 = and i32 %tmp, -16776961 + %tmp3 = and i32 %arg1, 16776960 + %tmp4 = xor i32 %tmp2, %tmp3 + store i32 %tmp4, i32 addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}and_or_or_and: +; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500 +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %tmp = load i32, i32 addrspace(1)* %gep, align 4 + %and = and i32 %tmp, 16711935 ; 0x00ff00ff + %tmp1 = and i32 %arg1, 4294967040 ; 0xffffff00 + %tmp2 = or i32 %tmp1, -65536 + %tmp3 = or i32 %tmp2, %and + store i32 %tmp3, i32 addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}and_or_and_shl: +; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00 +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +define amdgpu_kernel void @and_or_and_shl(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %tmp = load i32, i32 addrspace(1)* %gep, align 4 + %tmp2 = shl i32 %tmp, 16 + %tmp3 = and i32 %arg1, 65535 + %tmp4 = or i32 %tmp2, %tmp3 + %and = and i32 %tmp4, 4278190335 + store i32 %and, i32 addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}or_and_or: +; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104 +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +define amdgpu_kernel void @or_and_or(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %tmp = load i32, i32 addrspace(1)* %gep, align 4 + %or1 = or i32 %tmp, 16776960 ; 0x00ffff00 + %or2 = or i32 %arg1, 4278190335 ; 
0xff0000ff + %and = and i32 %or1, %or2 + store i32 %and, i32 addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}known_ffff0500: +; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500 +; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004 +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} +define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %load = load i32, i32 addrspace(1)* %gep, align 4 + %mask1 = or i32 %arg1, 32768 ; 0x8000 + %mask2 = or i32 %load, 4 + %and = and i32 %mask2, 16711935 ; 0x00ff00ff + %tmp1 = and i32 %mask1, 4294967040 ; 0xffffff00 + %tmp2 = or i32 %tmp1, 4294901760 ; 0xffff0000 + %tmp3 = or i32 %tmp2, %and + store i32 %tmp3, i32 addrspace(1)* %gep, align 4 + %v = and i32 %tmp3, 4294934532 ; 0xffff8004 + store i32 %v, i32 addrspace(1)* %arg, align 4 + ret void +} + +; GCN-LABEL: {{^}}known_050c0c00: +; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00 +; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 4{{$}} +; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} +define amdgpu_kernel void @known_050c0c00(i32 addrspace(1)* nocapture %arg, i32 %arg1) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id + %tmp = load i32, i32 addrspace(1)* %gep, align 4 + %tmp2 = shl i32 %tmp, 16 + %mask = or i32 %arg1, 4 + %tmp3 = and i32 %mask, 65535 + %tmp4 = or i32 %tmp2, %tmp3 + %and = and i32 %tmp4, 4278190335 + store i32 %and, i32 addrspace(1)* %gep, align 4 + %v = and i32 %and, 16776964 + store i32 %v, i32 addrspace(1)* %arg, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x()
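A quick host-side sanity check of the selector constants used in the tests above: a hedged sketch (hypothetical file permute_sketch.cpp, not part of the patch). It models only the byte-select behaviour implied by the combines and by the masks the tests expect, namely that selector bytes 0-3 pick bytes of the second source, 4-7 pick bytes of the first source, 0x0c yields 0x00, and 0x0d-0xff yield 0xff; the sign-replicating selectors 8-11 of the real instruction are not modelled and are not used by these masks.

// permute_sketch.cpp (hypothetical, illustration only).
// Build with: c++ -std=c++11 permute_sketch.cpp -o permute_sketch
#include <cassert>
#include <cstdint>
#include <cstdio>

// Byte-select model of "v_perm_b32 dst, src0, src1, sel" under the
// assumptions stated above: each selector byte picks a byte of the 64-bit
// value {src0:src1} (src1 in bytes 0-3, src0 in bytes 4-7), 0x0c produces
// 0x00, and anything larger produces 0xff.
static uint32_t permB32(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  uint64_t Bytes = (uint64_t(Src0) << 32) | Src1;
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t SelByte = (Sel >> (8 * I)) & 0xff;
    uint32_t ResByte;
    if (SelByte <= 7)
      ResByte = uint32_t(Bytes >> (8 * SelByte)) & 0xff; // pick a source byte
    else if (SelByte == 0x0c)
      ResByte = 0x00;                                    // selector 0xc -> zero
    else
      ResByte = 0xff;               // 0x0d-0xff -> 0xff (8-11 not modelled)
    Result |= ResByte << (8 * I);
  }
  return Result;
}

int main() {
  const uint32_t X = 0x12345678, Y = 0x9abcdef0;

  // lsh8_or_and: (x << 8) | (y & 0xff) should match selector 0x06050400.
  assert(permB32(X, Y, 0x06050400) == ((X << 8) | (Y & 0xff)));

  // lsh8_or_lsr24: (x << 8) | (y >> 24) should match selector 0x06050403.
  assert(permB32(X, Y, 0x06050403) == ((X << 8) | (Y >> 24)));

  // or_and_or: (x | 0x00ffff00) & (y | 0xff0000ff) -> selector 0x07020104.
  assert(permB32(X, Y, 0x07020104) == ((X | 0x00ffff00) & (Y | 0xff0000ff)));

  puts("selector constants agree with the scalar expressions");
  return 0;
}

If any assert fires, the corresponding selector does not reproduce the scalar expression under these assumptions.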