Index: llvm/include/llvm/Analysis/ConstantFolding.h
===================================================================
--- llvm/include/llvm/Analysis/ConstantFolding.h
+++ llvm/include/llvm/Analysis/ConstantFolding.h
@@ -170,6 +170,13 @@
 /// represented, return null.
 Constant *ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty);
 
+/// If C is an array or packed struct constant and every valid load of LoadTy
+/// at a multiple of LoadAlign bytes yields the same constant, return that
+/// constant. Otherwise return null.
+Constant *ConstantFoldLoadFromPatternedAggregate(Constant *C, Type *LoadTy,
+                                                 uint64_t LoadAlign,
+                                                 const DataLayout &DL);
+
 /// canConstantFoldCallTo - Return true if its even possible to fold a call to
 /// the specified function.
 bool canConstantFoldCallTo(const CallBase *Call, const Function *F);
Index: llvm/lib/Analysis/ConstantFolding.cpp
===================================================================
--- llvm/lib/Analysis/ConstantFolding.cpp
+++ llvm/lib/Analysis/ConstantFolding.cpp
@@ -767,6 +767,76 @@
   return nullptr;
 }
 
+Constant *llvm::ConstantFoldLoadFromPatternedAggregate(Constant *C,
+                                                       Type *LoadTy,
+                                                       uint64_t LoadAlign,
+                                                       const DataLayout &DL) {
+  // TODO: The current implementation can't handle unpacked structs or
+  // pointer-typed elements.
+  Type *CTy = C->getType();
+  auto *CSTy = dyn_cast<StructType>(CTy);
+  auto *CATy = dyn_cast<ArrayType>(CTy);
+  if (!CATy && !(CSTy && CSTy->isPacked()))
+    return nullptr;
+
+  // Bail if any direct element has pointer type.
+  if (CSTy)
+    for (unsigned I = 0, NumElm = CSTy->getNumElements(); I < NumElm; I++)
+      if (isa<PointerType>(CSTy->getElementType(I)))
+        return nullptr;
+  if (CATy && isa<PointerType>(CATy->getElementType()))
+    return nullptr;
+
+  // The result is rebuilt from raw bytes, so only handle the simple integer
+  // and floating-point types accepted by isElementTypeCompatible. They all
+  // occupy a whole number of bytes, so the division below is exact.
+  if (!ConstantDataSequential::isElementTypeCompatible(LoadTy))
+    return nullptr;
+  uint64_t LoadSize = LoadTy->getScalarSizeInBits() / 8;
+  uint64_t GVSize = DL.getTypeStoreSize(CTy);
+
+  // Bail for large initializers in excess of 64K to avoid allocating too much
+  // memory, and for loads that do not fit in the initializer at all: the
+  // unsigned bound "GVSize - LoadSize" below would otherwise wrap around.
+  if (UINT16_MAX < GVSize || LoadSize == 0 || GVSize < LoadSize)
+    return nullptr;
+
+  // A zero alignment would never advance the offset loop below.
+  if (LoadAlign == 0)
+    return nullptr;
+
+  // Read the initializer bytes. ReadDataFromGlobal reports failure for
+  // constants it cannot turn into bytes (for example an array of ptrtoint
+  // constant expressions). The zero-filled buffer must not be used then: it
+  // would look like a uniform pattern and fold the load to a bogus 0.
+  SmallVector<unsigned char> RawBytes(static_cast<size_t>(GVSize));
+  unsigned char *GVBytes = RawBytes.data();
+  if (!ReadDataFromGlobal(C, 0, GVBytes, GVSize, DL))
+    return nullptr;
+
+  // Every offset that is a multiple of LoadAlign and leaves room for a full
+  // load must produce the same bytes as the load at offset 0.
+  for (uint64_t ByteOffset = LoadAlign; ByteOffset <= GVSize - LoadSize;
+       ByteOffset += LoadAlign)
+    for (uint64_t I = 0; I < LoadSize; I++)
+      if (GVBytes[I] != GVBytes[I + ByteOffset])
+        return nullptr;
+
+  // Assemble the common bytes, most significant first in target byte order.
+  // Going through APInt keeps the result independent of the host byte order.
+  APInt Bits(static_cast<unsigned>(LoadSize * 8), 0);
+  for (uint64_t I = 0; I < LoadSize; I++) {
+    Bits <<= 8;
+    Bits |= GVBytes[DL.isLittleEndian() ? LoadSize - 1 - I : I];
+  }
+
+  // Convert the bits to a constant of the load type.
+  Constant *Res = ConstantInt::get(LoadTy->getContext(), Bits);
+  if (!LoadTy->isIntegerTy())
+    Res = ConstantExpr::getBitCast(Res, LoadTy);
+  return Res;
+}
+
 namespace {
 
 /// One of Op0/Op1 is a constant expression.
Index: llvm/lib/Analysis/InstructionSimplify.cpp
===================================================================
--- llvm/lib/Analysis/InstructionSimplify.cpp
+++ llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6634,16 +6634,21 @@
     return ConstantFoldLoadFromConstPtr(PtrOpC, LI->getType(), Q.DL);
 
   // We can only fold the load if it is from a constant global with definitive
-  // initializer. Skip expensive logic if this is not the case.
+  // or unique initializer. Skip expensive logic if this is not the case.
   auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(PtrOp));
-  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+  if (!GV || !GV->isConstant() ||
+      (!GV->hasDefinitiveInitializer() && !GV->hasUniqueInitializer()))
     return nullptr;
 
-  // If GlobalVariable's initializer is uniform, then return the constant
-  // regardless of its offset.
+ // If GlobalVariable's initializer is uniform, or all valid loads from a + // patterned array/struct yield the same constant, then return that constant + // regardless of the load's offset. if (Constant *C = ConstantFoldLoadFromUniformValue(GV->getInitializer(), LI->getType())) return C; + if (Constant *C = ConstantFoldLoadFromPatternedAggregate( + GV->getInitializer(), LI->getType(), LI->getAlign().value(), Q.DL)) + return C; // Try to convert operand into a constant by stripping offsets while looking // through invariant.group intrinsics. Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -19,12 +19,11 @@ ; OPT: @llvm.amdgcn.kernel.k123.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k123.lds.t undef, align 8 ; OPT{LITERAL}: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [2 x [1 x i32]] [[1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds to i32)], [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds to i32)]] -;. 
define void @f0() { ; OPT-LABEL: @f0( -; OPT-NEXT: %ld = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 -; OPT-NEXT: %mul = fmul float %ld, 2.000000e+00 -; OPT-NEXT: store float %mul, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 +; OPT-NEXT: [[LD:%.*]] = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 +; OPT-NEXT: [[MUL:%.*]] = fmul float [[LD]], 2.000000e+00 +; OPT-NEXT: store float [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4 ; OPT-NEXT: ret void ; ; GCN-LABEL: f0: @@ -46,9 +45,9 @@ define void @f1() { ; OPT-LABEL: @f1( -; OPT-NEXT: %ld = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 -; OPT-NEXT: %mul = mul i16 %ld, 3 -; OPT-NEXT: store i16 %mul, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 +; OPT-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 +; OPT-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3 +; OPT-NEXT: store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 16 ; OPT-NEXT: ret void ; ; GCN-LABEL: f1: @@ -70,32 +69,22 @@ define void @f2() { ; OPT-LABEL: @f2( -; OPT-NEXT: %1 = call i32 @llvm.amdgcn.lds.kernel.id() -; OPT-NEXT: %v22 = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 %1, i32 0 -; OPT-NEXT: %2 = load i32, ptr addrspace(4) %v22, align 4 -; OPT-NEXT: %v23 = inttoptr i32 %2 to ptr addrspace(3) -; OPT-NEXT: %ld = load i64, ptr addrspace(3) %v23, align 4 -; OPT-NEXT: %mul = mul i64 %ld, 4 -; OPT-NEXT: %v2 = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 %1, i32 0 -; OPT-NEXT: %3 = load i32, ptr addrspace(4) %v2, align 4 -; OPT-NEXT: %v21 = inttoptr i32 %3 to ptr addrspace(3) -; OPT-NEXT: store i64 %mul, ptr addrspace(3) %v21, align 4 +; OPT-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], 
i32 0 +; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4 +; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4 +; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4 +; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4 +; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4 ; OPT-NEXT: ret void ; ; GCN-LABEL: f2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s15 -; GCN-NEXT: s_ashr_i32 s5, s15, 31 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_b64 v[0:1], v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -111,9 +100,9 @@ define void @f3() { ; OPT-LABEL: @f3( -; OPT-NEXT: %ld = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k23.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 -; OPT-NEXT: %mul = mul i8 %ld, 5 -; OPT-NEXT: store i8 %mul, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k23.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 +; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 5 +; OPT-NEXT: store i8 [[MUL]], 
ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: f3: @@ -136,9 +125,9 @@ ; Doesn't access any via a function, won't be in the lookup table define amdgpu_kernel void @kernel_no_table() { ; OPT-LABEL: @kernel_no_table( -; OPT-NEXT: %ld = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 -; OPT-NEXT: %mul = mul i64 %ld, 8 -; OPT-NEXT: store i64 %mul, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 +; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 +; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8 +; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 ; OPT-NEXT: ret void ; ; GCN-LABEL: kernel_no_table: @@ -206,22 +195,20 @@ ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, f2@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, f2@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, f3@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, f3@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; 
GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm call void @f2() call void @f3() @@ -234,9 +221,9 @@ ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: %ld = load i8, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !2, !noalias !5 -; OPT-NEXT: %mul = mul i8 %ld, 8 -; OPT-NEXT: store i8 %mul, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !2, !noalias !5 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !2, !noalias !5 +; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !2, !noalias !5 ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; @@ -248,13 +235,13 @@ ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:16 @@ -284,11 +271,9 @@ !2 = !{i32 1} -;. ; OPT: attributes #0 = { "amdgpu-elide-module-lds" } ; OPT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } ; OPT: attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -;. ; OPT: !0 = !{i32 1} ; OPT: !1 = !{i32 0} ; OPT: !2 = !{!3} @@ -296,7 +281,6 @@ ; OPT: !4 = distinct !{!4} ; OPT: !5 = !{!6} ; OPT: !6 = distinct !{!6, !4} -;. ; Table size length number-kernels * number-variables * sizeof(uint16_t) ; GCN: .type llvm.amdgcn.lds.offset.table,@object Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -46,17 +46,7 @@ ; GCN-LABEL: f0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s15 -; GCN-NEXT: s_ashr_i32 s5, s15, 31 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -87,17 +77,7 @@ ; GCN-LABEL: f1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s15 -; GCN-NEXT: s_ashr_i32 s5, s15, 31 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8 -; GCN-NEXT: s_addc_u32 s7, s7, 
llvm.amdgcn.lds.offset.table@rel32@hi+16 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u16 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -128,17 +108,7 @@ ; GCN-LABEL: f2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s15 -; GCN-NEXT: s_ashr_i32 s5, s15, 31 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12 -; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_b64 v[0:1], v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -169,17 +139,7 @@ ; GCN-LABEL: f3: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s15 -; GCN-NEXT: s_ashr_i32 s5, s15, 31 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16 -; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +155,7 @@ ; Doesn't access any via a function, won't be in the lookup table define amdgpu_kernel void @kernel_no_table() { -; OPT-LABEL: @kernel_no_table() { +; OPT-LABEL: @kernel_no_table( ; OPT-NEXT: 
[[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8 ; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 @@ -218,7 +178,7 @@ ; Access two variables, will allocate those two define amdgpu_kernel void @k01() { -; OPT-LABEL: @k01() !llvm.amdgcn.lds.kernel.id !0 { +; OPT-LABEL: @k01( ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ] ; OPT-NEXT: call void @f0() ; OPT-NEXT: call void @f1() @@ -227,36 +187,32 @@ ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GCN-NEXT: s_add_i32 s4, s4, s7 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 0 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm -; GCN: .amdhsa_group_segment_fixed_size 8 call void @f0() call void @f1() ret void } define amdgpu_kernel void @k23() { -; OPT-LABEL: @k23() !llvm.amdgcn.lds.kernel.id !1 { +; OPT-LABEL: @k23( ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ] ; OPT-NEXT: call void @f2() ; 
OPT-NEXT: call void @f3() @@ -265,29 +221,25 @@ ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GCN-NEXT: s_add_i32 s4, s4, s7 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 2 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm -; GCN: .amdhsa_group_segment_fixed_size 16 call void @f2() call void @f3() ret void @@ -295,7 +247,7 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { -; OPT-LABEL: @k123() !llvm.amdgcn.lds.kernel.id !2 { +; OPT-LABEL: @k123( ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] ; OPT-NEXT: call void @f1() ; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !3, !noalias !6 @@ -307,18 +259,16 @@ ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mov_b32 
flat_scratch_lo, s5 +; GCN-NEXT: s_add_i32 s4, s4, s7 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -331,10 +281,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:2 -; GCN-NEXT: s_mov_b64 s[6:7], s[8:9] ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm -; GCN: .amdhsa_group_segment_fixed_size 16 call void @f1() %ld = load i8, ptr addrspace(3) @v3 %mul = mul i8 %ld, 8 Index: llvm/test/Transforms/InstSimplify/load-patterned-aggregates.ll =================================================================== --- llvm/test/Transforms/InstSimplify/load-patterned-aggregates.ll +++ llvm/test/Transforms/InstSimplify/load-patterned-aggregates.ll @@ -4,7 +4,7 @@ @constzeroarray = internal constant [4 x i32] zeroinitializer @constarray = internal constant [8 x i8] c"\01\00\01\00\01\00\01\00", align 4 -@conststruct = internal constant <{[8 x i8]}> <{[8 x i8] c"\01\00\01\00\01\00\01\00"}>, align 4 +@constpackedstruct = internal constant <{[8 x i8]}> <{[8 x i8] c"\01\00\01\00\01\00\01\00"}>, align 4 define i32 @load_gep_const_zero_array(i64 %idx) { ; CHECK-LABEL: @load_gep_const_zero_array( @@ -25,37 +25,9 @@ ret i8 %load } - -define i32 @load_gep_const_patterned_array(i64 %idx) { -; CHECK-LABEL: @load_gep_const_patterned_array( -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @constarray, i64 0, i64 [[IDX:%.*]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: ret i32 [[LOAD]] -; - %gep = getelementptr 
inbounds [4 x i32], ptr @constarray, i64 0, i64 %idx - %load = load i32, ptr %gep - ret i32 %load -} - -define i8 @load_i8_multi_gep_const_array(i64 %idx1, i64 %idx2) { -; CHECK-LABEL: @load_i8_multi_gep_const_array( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr @constarray, i64 [[IDX1:%.*]] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 [[IDX2:%.*]] -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1 -; CHECK-NEXT: ret i8 [[LOAD]] -; - %gep1 = getelementptr inbounds i8, ptr @constarray, i64 %idx1 - %gep = getelementptr inbounds i8, ptr %gep1, i64 %idx2 - %load = load i8, ptr %gep - ret i8 %load -} - -; TODO: this should be ret i8 1 define i8 @gep_load_i8_align2(i64 %idx){ ; CHECK-LABEL: @gep_load_i8_align2( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 2 -; CHECK-NEXT: ret i8 [[TMP2]] +; CHECK-NEXT: ret i8 1 ; %1 = getelementptr inbounds i8, ptr @constarray, i64 %idx %2 = load i8, ptr %1, align 2 @@ -74,26 +46,20 @@ ret i8 %2 } -; TODO: this should be ret i8 65537 on the case for little endian define i32 @gep_i32_load_i32_align4(i64 %idx){ ; CHECK-LABEL: @gep_i32_load_i32_align4( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constarray, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: ret i32 65537 ; %1 = getelementptr inbounds i32, ptr @constarray, i64 %idx %2 = load i32, ptr %1, align 4 ret i32 %2 } -; TODO: this should be ret i8 65537 on the case for little endian -define i32 @gep_i32_load_i32_align4_struct(i64 %idx){ -; CHECK-LABEL: @gep_i32_load_i32_align4_struct( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @conststruct, i64 [[IDX:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: ret i32 [[TMP2]] +define i32 @gep_i32_load_i32_align4_packedstruct(i64 %idx){ +; 
CHECK-LABEL: @gep_i32_load_i32_align4_packedstruct( +; CHECK-NEXT: ret i32 65537 ; - %1 = getelementptr inbounds i32, ptr @conststruct, i64 %idx + %1 = getelementptr inbounds i32, ptr @constpackedstruct, i64 %idx %2 = load i32, ptr %1, align 4 ret i32 %2 } @@ -111,16 +77,17 @@ } ; can't be folded -define i32 @gep_i8_load_i32_align1_struct(i64 %idx){ -; CHECK-LABEL: @gep_i8_load_i32_align1_struct( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @conststruct, i64 [[IDX:%.*]] +define i32 @gep_i8_load_i32_align1_packedstruct(i64 %idx){ +; CHECK-LABEL: @gep_i8_load_i32_align1_packedstruct( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constpackedstruct, i64 [[IDX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 ; CHECK-NEXT: ret i32 [[TMP2]] ; - %1 = getelementptr inbounds i8, ptr @conststruct, i64 %idx + %1 = getelementptr inbounds i8, ptr @constpackedstruct, i64 %idx %2 = load i32, ptr %1, align 1 ret i32 %2 } + ; TODO: This could be folded but need to see GEP source types define i32 @gep_i16_load_i32_align1(i64 %idx){ ; CHECK-LABEL: @gep_i16_load_i32_align1(