diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3292,6 +3292,17 @@ return false; } + // Lets target to control the following reassociation of operands: (op (op x, + // c1), y) -> (op (op x, y), c1) where N0 is (op x, c1) and N1 is y. By + // default consider profitable any case where N0 has single use. This + // behavior reflects the condition replaced by this target hook call in the + // DAGCombiner. Any particular target can implement its own heuristic to + // restrict common combiner. + virtual bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + return N0.hasOneUse(); + } + virtual bool isSDNodeAlwaysUniform(const SDNode * N) const { return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1070,7 +1070,7 @@ return DAG.getNode(Opc, DL, VT, N00, OpNode); return SDValue(); } - if (N0.hasOneUse()) { + if (TLI.isReassocProfitable(DAG, N0, N1)) { // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) // iff (op x, c1) has one use if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -449,6 +449,11 @@ bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; + bool hasMemSDNodeUser(SDNode *N) const; + + bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, MachineFunction &MF, diff --git 
a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9639,6 +9639,9 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) + return RV; + EVT VT = N->getValueType(0); if (VT != MVT::i64) return SDValue(); @@ -10551,6 +10554,9 @@ if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) + return SDValue(); + unsigned Opc = N->getOpcode(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -10572,12 +10578,6 @@ if (Op1->isDivergent()) std::swap(Op1, Op2); - // If either operand is constant this will conflict with - // DAGCombiner::ReassociateOps(). - if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || - DAG.isConstantIntBuildVectorOrConstantInt(Op1)) - return SDValue(); - SDLoc SL(N); SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); return DAG.getNode(Opc, SL, VT, Add1, Op2); @@ -12578,3 +12578,27 @@ Cost.first += (Size + 255) / 256; return Cost; } + +bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { + SDNode::use_iterator I = N->use_begin(), E = N->use_end(); + for (; I != E; ++I) { + if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) { + if (getBasePtrIndex(M) == I.getOperandNo()) + return true; + } + } + return false; +} + +bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + if (!N0.hasOneUse()) + return false; + // Take care of the opportunity to keep N0 uniform + if (N0->isDivergent() || !N1->isDivergent()) + return true; + // Check if we have a good chance to form the memory access pattern with the + // base and offset + return (DAG.isBaseWithConstantOffset(N0) && + hasMemSDNodeUser(*N0->use_begin())); +} diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td ---
a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -550,11 +550,11 @@ >; def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", - [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))] + [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))] >; def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", - [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] + [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))] >; def S_NAND_B32 : SOP2_32 <"s_nand_b32", diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -637,9 +637,9 @@ ) >; -def : divergent_i64_BinOp <and, V_AND_B32_e64>; -def : divergent_i64_BinOp <or, V_OR_B32_e64>; -def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; +def : divergent_i64_BinOp <and_oneuse, V_AND_B32_e64>; +def : divergent_i64_BinOp <or_oneuse, V_OR_B32_e64>; +def : divergent_i64_BinOp <xor_oneuse, V_XOR_B32_e64>; let SubtargetPredicate = Has16BitInsts in { @@ -688,6 +688,36 @@ let isReMaterializable = 1 in defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>; +def : GCNPat< + (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i32 (DivergentBinFrag<xor> (not i32:$src0), i32:$src1)), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + +def : GCNPat< + (i64 (DivergentBinFrag<xor> (not i64:$src0), i64:$src1)), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + let Constraints = "$vdst = $src2", DisableEncoding =
"$src2", isConvertibleToThreeAddress = 1, diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN_DL %s + +; GCN-LABEL: name: uniform_xnor_i64 +; GCN: S_XNOR_B64 +define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %xor = xor i64 %a, %b + %res = xor i64 %xor, -1 + store i64 %res, i64 addrspace(1)* %out + ret void +} +; GCN-LABEL: name: divergent_xnor_i64 +; GCN: V_XOR_B32_e64 +; GCN: V_XOR_B32_e64 +; GCN: V_NOT_B32_e32 +; GCN: V_NOT_B32_e32 +; GCN_DL: V_XNOR_B32_e64 +; GCN_DL: V_XNOR_B32_e64 +define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %xor = xor i64 %a, %b + %res = xor i64 %xor, -1 + ret i64 %res +} + +; GCN-LABEL: name: uniform_xnor_i32 +; GCN: S_XNOR_B32 +define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %xor = xor i32 %a, %b + %res = xor i32 %xor, -1 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: name: divergent_xnor_i32 +; GCN: V_XOR_B32_e64 +; GCN: V_NOT_B32_e32 +; GCN_DL: V_XNOR_B32_e64 +define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %xor = xor i32 %a, %b + %res = xor i32 %xor, -1 + ret i32 %res +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -163,8 +163,8 @@ ; GCN-NEXT: v_xor_b32_e32 v1, v3, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4 ; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5 +; GCN-NEXT: 
v_xnor_b32_e32 v0, v0, v4 ; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/perm.new.s b/llvm/test/CodeGen/AMDGPU/perm.new.s new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/perm.new.s @@ -0,0 +1,752 @@ + .text + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_and ; -- Begin function lsh8_or_and + .p2align 8 + .type lsh8_or_and,@function +lsh8_or_and: ; @lsh8_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x6050400 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end0: + .size lsh8_or_and, .Lfunc_end0-lsh8_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsr24_or_and ; -- Begin function lsr24_or_and + .p2align 8 + .type lsr24_or_and,@function +lsr24_or_and: ; @lsr24_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c 
+ v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, s0, v2, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end1: + .size lsr24_or_and, .Lfunc_end1-lsr24_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_lsr24 ; -- Begin function and_or_lsr24 + .p2align 8 + .type and_or_lsr24,@function +and_or_lsr24: ; @and_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + v_xor_b32_e32 v2, 0x80000000, v2 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end2: + .size and_or_lsr24, .Lfunc_end2-and_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 84 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; 
VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and ; -- Begin function and_or_and + .p2align 8 + .type and_or_and,@function +and_or_and: ; @and_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020500 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end3: + .size and_or_and, .Lfunc_end3-and_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_lsr24 ; -- Begin function lsh8_or_lsr24 + .p2align 8 + .type lsh8_or_lsr24,@function +lsh8_or_lsr24: ; @lsh8_or_lsr24 +; %bb.0: ; %bb + 
s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_alignbit_b32 v2, v2, s0, 24 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end4: + .size lsh8_or_lsr24, .Lfunc_end4-lsh8_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 68 +; NumSgprs: 96 +; NumVgprs: 3 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 3 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh16_or_lsr24 ; -- Begin function lsh16_or_lsr24 + .p2align 8 + .type lsh16_or_lsr24,@function +lsh16_or_lsr24: ; @lsh16_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x5040c03 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end5: + .size lsh16_or_lsr24, .Lfunc_end5-lsh16_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; 
SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_xor_and ; -- Begin function and_xor_and + .p2align 8 + .type and_xor_and,@function +and_xor_and: ; @and_xor_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end6: + .size and_xor_and, .Lfunc_end6-and_xor_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_or_and ; -- Begin function and_or_or_and + .p2align 8 + .type and_or_or_and,@function +and_or_or_and: ; @and_or_or_and 
+; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_and_b32 s0, s0, 0xff00 + s_or_b32 s0, s0, 0xffff0000 + s_waitcnt vmcnt(0) + v_and_b32_e32 v2, 0xff00ff, v2 + v_or_b32_e32 v2, s0, v2 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end7: + .size and_or_or_and, .Lfunc_end7-and_or_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 88 +; NumSgprs: 96 +; NumVgprs: 3 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 3 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and_shl ; -- Begin function and_or_and_shl + .p2align 8 + .type and_or_and_shl,@function +and_or_and_shl: ; @and_or_and_shl +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x50c0c00 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end8: + .size and_or_and_shl, .Lfunc_end8-and_or_and_shl + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; 
MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl or_and_or ; -- Begin function or_and_or + .p2align 8 + .type or_and_or,@function +or_and_or: ; @or_and_or +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end9: + .size or_and_or, .Lfunc_end9-or_and_or + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff0500 ; -- Begin function 
known_ffff0500 + .p2align 8 + .type known_ffff0500,@function +known_ffff0500: ; @known_ffff0500 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_bitset1_b32 s0, 15 + s_and_b32 s0, s0, 0xff00 + s_or_b32 s0, s0, 0xffff0000 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 4, v4 + v_and_b32_e32 v4, 0xff00ff, v4 + v_or_b32_e32 v4, s0, v4 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v5 + s_endpgm +.Lfunc_end10: + .size known_ffff0500, .Lfunc_end10-known_ffff0500 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 120 +; NumSgprs: 96 +; NumVgprs: 6 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 6 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_050c0c00 ; -- Begin function known_050c0c00 + .p2align 8 + .type known_050c0c00,@function +known_050c0c00: ; @known_050c0c00 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0x50c0c00 + v_mov_b32_e32 v6, 4 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 
4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end11: + .size known_050c0c00, .Lfunc_end11-known_050c0c00 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 100 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff8004 ; -- Begin function known_ffff8004 + .p2align 8 + .type known_ffff8004,@function +known_ffff8004: ; @known_ffff8004 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff0500 + v_mov_b32_e32 v6, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 0x8000, v4 + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end12: + .size known_ffff8004, .Lfunc_end12-known_ffff8004 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 112 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; 
SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section ".note.GNU-stack" + .amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx802" diff --git a/llvm/test/CodeGen/AMDGPU/perm.old.s b/llvm/test/CodeGen/AMDGPU/perm.old.s new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/perm.old.s @@ -0,0 +1,748 @@ + .text + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_and ; -- Begin function lsh8_or_and + .p2align 8 + .type lsh8_or_and,@function +lsh8_or_and: ; @lsh8_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x6050400 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end0: + .size lsh8_or_and, .Lfunc_end0-lsh8_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section 
.AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsr24_or_and ; -- Begin function lsr24_or_and + .p2align 8 + .type lsr24_or_and,@function +lsr24_or_and: ; @lsr24_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, s0, v2, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end1: + .size lsr24_or_and, .Lfunc_end1-lsr24_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_lsr24 ; -- Begin function and_or_lsr24 + .p2align 8 + .type and_or_lsr24,@function +and_or_lsr24: ; @and_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + v_xor_b32_e32 v2, 0x80000000, v2 + 
flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end2: + .size and_or_lsr24, .Lfunc_end2-and_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 84 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and ; -- Begin function and_or_and + .p2align 8 + .type and_or_and,@function +and_or_and: ; @and_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020500 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end3: + .size and_or_and, .Lfunc_end3-and_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; 
COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_lsr24 ; -- Begin function lsh8_or_lsr24 + .p2align 8 + .type lsh8_or_lsr24,@function +lsh8_or_lsr24: ; @lsh8_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_alignbit_b32 v2, v2, s0, 24 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end4: + .size lsh8_or_lsr24, .Lfunc_end4-lsh8_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 68 +; NumSgprs: 96 +; NumVgprs: 3 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 3 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh16_or_lsr24 ; -- Begin function lsh16_or_lsr24 + .p2align 8 + .type lsh16_or_lsr24,@function +lsh16_or_lsr24: ; @lsh16_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x5040c03 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) 
+ v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end5: + .size lsh16_or_lsr24, .Lfunc_end5-lsh16_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_xor_and ; -- Begin function and_xor_and + .p2align 8 + .type and_xor_and,@function +and_xor_and: ; @and_xor_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end6: + .size and_xor_and, .Lfunc_end6-and_xor_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; 
COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_or_and ; -- Begin function and_or_or_and + .p2align 8 + .type and_or_or_and,@function +and_or_or_and: ; @and_or_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0xffff0500 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, s0, v2, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end7: + .size and_or_or_and, .Lfunc_end7-and_or_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and_shl ; -- Begin function and_or_and_shl + .p2align 8 + .type and_or_and_shl,@function +and_or_and_shl: ; @and_or_and_shl +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x50c0c00 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 
0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end8: + .size and_or_and_shl, .Lfunc_end8-and_or_and_shl + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl or_and_or ; -- Begin function or_and_or + .p2align 8 + .type or_and_or,@function +or_and_or: ; @or_and_or +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end9: + .size or_and_or, .Lfunc_end9-or_and_or + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; 
COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff0500 ; -- Begin function known_ffff0500 + .p2align 8 + .type known_ffff0500,@function +known_ffff0500: ; @known_ffff0500 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff0500 + v_mov_b32_e32 v6, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_bitset1_b32 s0, 15 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 4, v4 + v_perm_b32 v4, s0, v4, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end10: + .size known_ffff0500, .Lfunc_end10-known_ffff0500 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 108 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_050c0c00 ; -- Begin function known_050c0c00 + .p2align 8 + .type known_050c0c00,@function +known_050c0c00: ; @known_050c0c00 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 
0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0x50c0c00 + v_mov_b32_e32 v6, 4 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end11: + .size known_050c0c00, .Lfunc_end11-known_050c0c00 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 100 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff8004 ; -- Begin function known_ffff8004 + .p2align 8 + .type known_ffff8004,@function +known_ffff8004: ; @known_ffff8004 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff0500 + v_mov_b32_e32 v6, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 0x8000, v4 + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end12: + .size 
known_ffff8004, .Lfunc_end12-known_ffff8004 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 112 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section ".note.GNU-stack" + .amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx802" diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -106,8 +106,10 @@ } ; GCN-LABEL: {{^}}and_or_or_and: -; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500 -; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff00 +; GCN: s_or_b32 [[SREG:s[0-9]+]], s{{[0-9]+}}, 0xffff0000 +; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, v{{[0-9]+}} +; GCN: v_or_b32_e32 v{{[0-9]+}}, [[SREG]], [[VREG]] define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) { bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -153,9 +155,12 @@ } ; GCN-LABEL: {{^}}known_ffff0500: -; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500 -; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004 -; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004 +; GCN: s_and_b32 [[SREG:s[0-9]+]], [[SREG]], 0xff00 +; GCN: s_or_b32 [[SREG]], [[SREG]], 0xffff0000 +; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, [[VREG]] +; GCN: v_or_b32_e32 [[VREG]], [[SREG]], [[VREG]] +; GCN: store_dword v[{{[0-9:]+}}], 
[[VREG]]{{$}} ; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -472,10 +472,10 @@ ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v1, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/xnor.ll @@ -61,8 +61,8 @@ ; GCN-LABEL: {{^}}vector_xnor_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { entry: @@ -73,10 +73,10 @@ ; GCN-LABEL: {{^}}vector_xnor_i64_one_use ; GCN-NOT: s_xnor_b64 -; GCN: v_not_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 ; GCN-DL: v_xnor_b32 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { @@ -150,8 +150,8 @@ ; GCN-LABEL: {{^}}vector_xor_na_b_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) { entry: @@ -162,8 +162,8 @@ ; GCN-LABEL: {{^}}vector_xor_a_nb_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 
@vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -26,13 +26,13 @@ define amdgpu_ps float @xor3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) { ; GFX9-LABEL: xor3_vgpr_b: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX9-NEXT: s_xor_b32 s0, s3, s2 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xor3_vgpr_b: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor3_b32 v0, s2, v0, s3 +; GFX10-NEXT: v_xor3_b32 v0, s3, s2, v0 ; GFX10-NEXT: ; return to shader part epilog %x = xor i32 %a, %b %result = xor i32 %x, %c