Index: llvm/lib/Analysis/ConstantFolding.cpp
===================================================================
--- llvm/lib/Analysis/ConstantFolding.cpp
+++ llvm/lib/Analysis/ConstantFolding.cpp
@@ -1470,6 +1470,7 @@
   case Intrinsic::vector_reduce_umin:
   case Intrinsic::vector_reduce_umax:
   // Target intrinsics
+  case Intrinsic::amdgcn_perm:
   case Intrinsic::arm_mve_vctp8:
   case Intrinsic::arm_mve_vctp16:
   case Intrinsic::arm_mve_vctp32:
@@ -2702,6 +2703,66 @@
   }
 }
 
+/// Constant-fold a call to the llvm.amdgcn.perm intrinsic.
+///
+/// Operands[2] is a selector made of four 8-bit fields; field N decides what
+/// goes into byte N of the 32-bit result:
+///   0-3    byte (Sel & 3) of Operands[1]
+///   4-7    byte (Sel & 3) of Operands[0]
+///   8, 9   bit 15 / bit 31 of Operands[1], replicated across the byte
+///   10, 11 bit 15 / bit 31 of Operands[0], replicated across the byte
+///   12     0x00
+///   >= 13  0xff
+///
+/// \returns nullptr if any operand is neither a constant integer nor undef;
+/// undef if the selector is undef or if all four result bytes read an undef
+/// source; otherwise a ConstantInt of type \p Ty in which any byte read from
+/// an undef source is folded to zero.
+static Constant *ConstantFoldAMDGCNPermIntrinsic(ArrayRef<Constant *> Operands,
+                                                 Type *Ty) {
+  const APInt *C0, *C1, *C2;
+  if (!getConstIntOrUndef(Operands[0], C0) ||
+      !getConstIntOrUndef(Operands[1], C1) ||
+      !getConstIntOrUndef(Operands[2], C2))
+    return nullptr;
+
+  // C2 is null when the selector is undef (see getConstIntOrUndef); the
+  // result is then undef as well.
+  if (!C2)
+    return UndefValue::get(Ty);
+
+  APInt Val(32, 0);
+  unsigned NumUndefBytes = 0;
+  // Walk the selector one byte at a time, lowest byte first.
+  for (unsigned I = 0; I < 32; I += 8) {
+    unsigned Sel = C2->extractBitsAsZExtValue(8, I);
+    unsigned B = 0;
+
+    if (Sel >= 13)
+      B = 0xff;
+    else if (Sel == 12)
+      B = 0x00;
+    else {
+      // Selectors 4-7 and 10-11 read operand 0; 0-3 and 8-9 read operand 1.
+      const APInt *Src = ((Sel & 10) == 10 || (Sel & 12) == 4) ? C0 : C1;
+      if (!Src)
+        ++NumUndefBytes; // Undef source: B stays zero.
+      else if (Sel < 8)
+        B = Src->extractBitsAsZExtValue(8, (Sel & 3) * 8);
+      else
+        // Bit 31 for odd selectors, bit 15 for even; replicate it 8 times.
+        B = Src->extractBitsAsZExtValue(1, (Sel & 1) ? 31 : 15) * 0xff;
+    }
+
+    Val.insertBits(B, I, 8);
+  }
+
+  if (NumUndefBytes == 4)
+    return UndefValue::get(Ty);
+
+  return ConstantInt::get(Ty, Val);
+}
+
 static Constant *ConstantFoldScalarCall3(StringRef Name,
                                          Intrinsic::ID IntrinsicID,
                                          Type *Ty,
@@ -2817,6 +2878,9 @@
     return ConstantInt::get(Ty, C0->shl(ShlAmt) | C1->lshr(LshrAmt));
   }
 
+  if (IntrinsicID == Intrinsic::amdgcn_perm)
+    return ConstantFoldAMDGCNPermIntrinsic(Operands, Ty);
+
   return nullptr;
 }
 
Index: llvm/test/Transforms/InstSimplify/ConstProp/AMDGPU/perm.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstSimplify/ConstProp/AMDGPU/perm.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+declare i32 @llvm.amdgcn.perm(i32, i32, i32)
+
+; src1 = 0x19203a4b (421542475), src2 = 0x5c6d7e8f (1550679695)
+define void @test(i32* %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    store volatile i32 undef, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    store volatile i32 -1887539876, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 2121096267, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 1262100505, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 1550679695, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 421542475, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 545143439, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 16711935, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 16711935, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 436174336, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 16711680, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 -1, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 -1, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 -1, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 undef, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 421542475, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 1550679695, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 undef, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 143, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 0, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 255, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 1550679552, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 75, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 0, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 255, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 65535, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 421542400, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 -16776961, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 255, i32* [[P]], align 4
+; CHECK-NEXT:    store volatile i32 -16777216, i32* [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+  %s1s2_u = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 undef)
+  store volatile i32 %s1s2_u, i32* %p
+  %s1s2_0x00010203 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 66051)
+  store volatile i32 %s1s2_0x00010203, i32* %p
+  %s1s2_0x01020304 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 16909060)
+  store volatile i32 %s1s2_0x01020304, i32* %p
+  %s1s2_0x04050607 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 67438087)
+  store volatile i32 %s1s2_0x04050607, i32* %p
+  %s1s2_0x03020100 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 50462976)
+  store volatile i32 %s1s2_0x03020100, i32* %p
+  %s1s2_0x07060504 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 117835012)
+  store volatile i32 %s1s2_0x07060504, i32* %p
+  %s1s2_0x06010500 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 100730112)
+  store volatile i32 %s1s2_0x06010500, i32* %p
+  %s1s2_0x0c0f0c0f = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 202312719)
+  store volatile i32 %s1s2_0x0c0f0c0f, i32* %p
+  %u1u2_0x0c0f0c0f = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 202312719)
+  store volatile i32 %u1u2_0x0c0f0c0f, i32* %p
+  %s1s2_0x070d010c = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 118292748)
+  store volatile i32 %s1s2_0x070d010c, i32* %p
+  %u1u2_0x070d010c = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 118292748)
+  store volatile i32 %u1u2_0x070d010c, i32* %p
+  %s1s2_0x80818283 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 1550679695, i32 2155971203)
+  store volatile i32 %s1s2_0x80818283, i32* %p
+  %u1u2_0x80818283 = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 2155971203)
+  store volatile i32 %u1u2_0x80818283, i32* %p
+  %u1u2_0x0e0e0e0e = call i32 @llvm.amdgcn.perm(i32 undef, i32 undef, i32 235802126)
+  store volatile i32 %u1u2_0x0e0e0e0e, i32* %p
+  %u1s2_0x07060504 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835012)
+  store volatile i32 %u1s2_0x07060504, i32* %p
+  %s1u2_0x07060504 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 117835012)
+  store volatile i32 %s1u2_0x07060504, i32* %p
+  %u1s2_0x03020100 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 50462976)
+  store volatile i32 %u1s2_0x03020100, i32* %p
+  %s1u2_0x03020100 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462976)
+  store volatile i32 %s1u2_0x03020100, i32* %p
+  %u1s2_0x07060500 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835008)
+  store volatile i32 %u1s2_0x07060500, i32* %p
+  %u1s2_0x0706050c = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835020)
+  store volatile i32 %u1s2_0x0706050c, i32* %p
+  %u1s2_0x0706050d = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 117835021)
+  store volatile i32 %u1s2_0x0706050d, i32* %p
+  %u1s2_0x03020104 = call i32 @llvm.amdgcn.perm(i32 undef, i32 1550679695, i32 50462980)
+  store volatile i32 %u1s2_0x03020104, i32* %p
+  %s1u2_0x03020104 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462980)
+  store volatile i32 %s1u2_0x03020104, i32* %p
+  %s1u2_0x0302010c = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462988)
+  store volatile i32 %s1u2_0x0302010c, i32* %p
+  %s1u2_0x0302010e = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50462990)
+  store volatile i32 %s1u2_0x0302010e, i32* %p
+  %s1u2_0x03020f0e = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 50466574)
+  store volatile i32 %s1u2_0x03020f0e, i32* %p
+  %s1u2_0x07060500 = call i32 @llvm.amdgcn.perm(i32 421542475, i32 undef, i32 117835008)
+  store volatile i32 %s1u2_0x07060500, i32* %p
+  %_0x81000100_0x01008100_0x0b0a0908 = call i32 @llvm.amdgcn.perm(i32 2164261120, i32 16810240, i32 185207048)
+  store volatile i32 %_0x81000100_0x01008100_0x0b0a0908, i32* %p
+  %_u1_0x01008100_0x0b0a0908 = call i32 @llvm.amdgcn.perm(i32 undef, i32 16810240, i32 185207048)
+  store volatile i32 %_u1_0x01008100_0x0b0a0908, i32* %p
+  %_0x81000100_u2_0x0b0a0908 = call i32 @llvm.amdgcn.perm(i32 2164261120, i32 undef, i32 185207048)
+  store volatile i32 %_0x81000100_u2_0x0b0a0908, i32* %p
+  ret void
+}