Index: include/polly/CodeGen/IslNodeBuilder.h =================================================================== --- include/polly/CodeGen/IslNodeBuilder.h +++ include/polly/CodeGen/IslNodeBuilder.h @@ -209,7 +209,7 @@ virtual void createFor(__isl_take isl_ast_node *For); /// @brief Set to remember materialized invariant loads. - SmallPtrSet PreloadedPtrs; + SmallSet<std::pair<const SCEV *, Type *>, 16> PreloadedPtrs; /// @brief Preload the memory access at @p AccessRange with @p Build. /// Index: include/polly/ScopInfo.h =================================================================== --- include/polly/ScopInfo.h +++ include/polly/ScopInfo.h @@ -854,7 +854,7 @@ /// location is accessed, hence the union of all domain contexts for the memory /// accesses in the list. using InvariantEquivClassTy = - std::tuple<const SCEV *, MemoryAccessList, isl_set *>; + std::tuple<const SCEV *, MemoryAccessList, isl_set *, Type *>; /// @brief Type for invariant accesses equivalence classes. using InvariantEquivClassesTy = SmallVector; Index: lib/Analysis/ScopInfo.cpp =================================================================== --- lib/Analysis/ScopInfo.cpp +++ lib/Analysis/ScopInfo.cpp @@ -1827,21 +1827,22 @@ } void Scop::buildInvariantEquivalenceClasses() { - DenseMap<const SCEV *, LoadInst *> EquivClasses; + DenseMap<std::pair<const SCEV *, Type *>, LoadInst *> EquivClasses; const InvariantLoadsSetTy &RIL = *SD.getRequiredInvariantLoads(&getRegion()); for (LoadInst *LInst : RIL) { const SCEV *PointerSCEV = SE->getSCEV(LInst->getPointerOperand()); - LoadInst *&ClassRep = EquivClasses[PointerSCEV]; + Type *Ty = LInst->getType(); + LoadInst *&ClassRep = EquivClasses[std::make_pair(PointerSCEV, Ty)]; if (ClassRep) { InvEquivClassVMap[LInst] = ClassRep; continue; } ClassRep = LInst; - InvariantEquivClasses.emplace_back(PointerSCEV, MemoryAccessList(), - nullptr); + InvariantEquivClasses.emplace_back(PointerSCEV, MemoryAccessList(), nullptr, + Ty); } } @@ -2852,9 +2853,10 @@ if (Value *Rep = InvEquivClassVMap.lookup(LInst)) LInst = cast<LoadInst>(Rep); + Type *Ty = LInst->getType(); const SCEV *PointerSCEV = SE->getSCEV(LInst->getPointerOperand()); for (auto 
&IAClass : InvariantEquivClasses) - if (PointerSCEV == std::get<0>(IAClass)) + if (PointerSCEV == std::get<0>(IAClass) && Ty == std::get<3>(IAClass)) return &IAClass; return nullptr; @@ -2897,11 +2899,12 @@ // MA and if found consolidate them. Otherwise create a new equivalence // class at the end of InvariantEquivClasses. LoadInst *LInst = cast<LoadInst>(MA->getAccessInstruction()); + Type *Ty = LInst->getType(); const SCEV *PointerSCEV = SE->getSCEV(LInst->getPointerOperand()); bool Consolidated = false; for (auto &IAClass : InvariantEquivClasses) { - if (PointerSCEV != std::get<0>(IAClass)) + if (PointerSCEV != std::get<0>(IAClass) || Ty != std::get<3>(IAClass)) continue; Consolidated = true; @@ -2926,7 +2929,7 @@ // If we did not consolidate MA, thus did not find an equivalence class // for it, we create a new one. InvariantEquivClasses.emplace_back(PointerSCEV, MemoryAccessList{MA}, - isl_set_copy(DomainCtx)); + isl_set_copy(DomainCtx), Ty); } isl_set_free(DomainCtx); @@ -2971,15 +2974,6 @@ isl_map *AccessRelation = Access->getAccessRelation(); - // Invariant load hoisting of memory accesses with non-canonical element - // types lacks support for equivalence classes that contain elements of - // different width/size. Hence, do not yet consider loads with non-canonical - // element size for load hoisting. - if (!isl_map_is_single_valued(AccessRelation)) { - isl_map_free(AccessRelation); - return false; - } - // Skip accesses that have an empty access relation. These can be caused // by multiple offsets with a type cast in-between that cause the overall // byte offset to be not divisible by the new types sizes. Index: lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- lib/CodeGen/IslNodeBuilder.cpp +++ lib/CodeGen/IslNodeBuilder.cpp @@ -1015,7 +1015,8 @@ // Check for recurrsion which can be caused by additional constraints, e.g., // non-finitie loop contraints. 
In such a case we have to bail out and insert // a "false" runtime check that will cause the original code to be executed. - if (!PreloadedPtrs.insert(std::get<0>(IAClass)).second) + auto PtrId = std::make_pair(std::get<0>(IAClass), std::get<3>(IAClass)); + if (!PreloadedPtrs.insert(PtrId).second) return false; // If the base pointer of this class is dependent on another one we have to @@ -1033,13 +1034,10 @@ if (!PreloadVal) return false; - assert(PreloadVal->getType() == AccInst->getType()); for (const MemoryAccess *MA : MAs) { Instruction *MAAccInst = MA->getAccessInstruction(); - // TODO: The bitcast here is wrong. In case of floating and non-floating - // point values we need to reload the value or convert it. - ValueMap[MAAccInst] = - Builder.CreateBitOrPointerCast(PreloadVal, MAAccInst->getType()); + assert(PreloadVal->getType() == MAAccInst->getType()); + ValueMap[MAAccInst] = PreloadVal; } if (SE.isSCEVable(AccInstTy)) { @@ -1063,11 +1061,8 @@ // should only change the base pointer of the derived SAI if we actually // preloaded it. if (BasePtr == MA->getBaseAddr()) { - // TODO: The bitcast here is wrong. In case of floating and non-floating - // point values we need to reload the value or convert it. - BasePtr = - Builder.CreateBitOrPointerCast(PreloadVal, BasePtr->getType()); - DerivedSAI->setBasePtr(BasePtr); + assert(BasePtr->getType() == PreloadVal->getType()); + DerivedSAI->setBasePtr(PreloadVal); } // For scalar derived SAIs we remap the alloca used for the derived value. Index: test/Isl/CodeGen/multiple-types-invariant-load.ll =================================================================== --- test/Isl/CodeGen/multiple-types-invariant-load.ll +++ test/Isl/CodeGen/multiple-types-invariant-load.ll @@ -1,18 +1,9 @@ -; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s - -; Invariant loads with non-canonical types are not yet fully supported. 
- -; XFAIL: * +; RUN: opt %loadPolly -polly-allow-differing-element-types -polly-codegen -S < %s | FileCheck %s ; CHECK: %polly.access.cast.global.load = bitcast %struct.hoge* %global.load to i32* ; CHECK: %polly.access.global.load = getelementptr i32, i32* %polly.access.cast.global.load, i64 0 ; CHECK: %polly.access.global.load.load = load i32, i32* %polly.access.global.load -; CHECK: %polly.access.cast.global.load1 = bitcast %struct.hoge* %global.load to i32* -; CHECK: %polly.access.global.load2 = getelementptr i32, i32* %polly.access.cast.global.load1, i64 2 -; CHECK: %polly.access.global.load2.cast = bitcast i32* %polly.access.global.load2 to double* -; CHECK: %polly.access.global.load2.load = load double, double* %polly.access.global.load2.cast - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll =================================================================== --- test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll +++ test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll @@ -11,6 +11,9 @@ ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: { Stmt_for_body[i0] -> MemRef_U[0] }; ; CHECK-NEXT: Execution Context: { : } +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: { Stmt_for_body[i0] -> MemRef_U[0] }; +; CHECK-NEXT: Execution Context: { : } ; CHECK-NEXT: } ; ; CHECK: Statements { @@ -24,13 +27,15 @@ ; CHECK-NEXT: } ; ; CODEGEN: entry: -; CODEGEN: %U.f.preload.s2a = alloca float +; CODEGEN-DAG: %U.f.preload.s2a = alloca float +; CODEGEN-DAG: %U.i.preload.s2a = alloca i32 ; CODEGEN: br label %polly.split_new_and_old ; ; CODEGEN: polly.preload.begin: -; CODEGEN: %U.load = load float, float* bitcast (i32* @U to float*) -; CODEGEN: %0 = bitcast float %U.load to i32 -; CODEGEN: store float %U.load, float* 
%U.f.preload.s2a +; CODEGEN-DAG: %U.load[[f:[.0-9]*]] = load float, float* bitcast (i32* @U to float*) +; CODEGEN-DAG: store float %U.load[[f]], float* %U.f.preload.s2a +; CODEGEN-DAG: %U.load[[i:[.0-9]*]] = load i32, i32* @U +; CODEGEN-DAG: store i32 %U.load[[i]], i32* %U.i.preload.s2a ; ; CODEGEN: polly.merge_new_and_old: ; CODEGEN-NOT: merge = phi @@ -39,8 +44,7 @@ ; CODEGEN-NOT: final_reload ; ; CODEGEN: polly.stmt.for.body: -; CODEGEN: %p_conv = fptosi float %U.load to i32 -; CODEGEN: %p_add = add nsw i32 %0, %p_conv +; CODEGEN: %p_add = add nsw i32 %U.load[[i]], %p_conv ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll =================================================================== --- test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll +++ test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll @@ -16,6 +16,9 @@ ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: { Stmt_do_body[i0] -> MemRef_U[0] }; ; CHECK-NEXT: Execution Context: { : } +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: { Stmt_do_body[i0] -> MemRef_U[0] }; +; CHECK-NEXT: Execution Context: { : } ; CHECK-NEXT: } ; ; CHECK: Statements { @@ -29,26 +32,26 @@ ; CHECK-NEXT: } ; ; CODEGEN: entry: -; CODEGEN: %U.f.preload.s2a = alloca float +; CODEGEN-DAG: %U.f.preload.s2a = alloca float +; CODEGEN-DAG: %U.i.preload.s2a = alloca i32 ; CODEGEN: br label %polly.split_new_and_old ; ; CODEGEN: polly.preload.begin: -; CODEGEN: %U.load = load float, float* bitcast (i32* @U to float*) -; CODEGEN: %0 = bitcast float %U.load to i32 -; CODEGEN: store float %U.load, float* %U.f.preload.s2a +; CODEGEN-DAG: %U.load[[f:[.0-9]*]] = load float, float* bitcast (i32* @U to float*) +; CODEGEN-DAG: store float %U.load[[f]], float* %U.f.preload.s2a +; CODEGEN-DAG: %U.load[[i:[.0-9]*]] = 
load i32, i32* @U +; CODEGEN-DAG: store i32 %U.load[[i]], i32* %U.i.preload.s2a ; ; CODEGEN: polly.merge_new_and_old: ; CODEGEN-DAG: %U.f.merge = phi float [ %U.f.final_reload, %polly.exiting ], [ %U.f, %do.cond ] -; CODEGEN-DAG: %U.i.merge = phi i32 [ %5, %polly.exiting ], [ %U.i, %do.cond ] +; CODEGEN-DAG: %U.i.merge = phi i32 [ %U.i.final_reload, %polly.exiting ], [ %U.i, %do.cond ] ; ; CODEGEN: polly.loop_exit: ; CODEGEN-DAG: %U.f.final_reload = load float, float* %U.f.preload.s2a -; CODEGEN-DAG: %U.i.final_reload = load float, float* %U.f.preload.s2a -; CODEGEN-DAG: %5 = bitcast float %U.i.final_reload to i32 +; CODEGEN-DAG: %U.i.final_reload = load i32, i32* %U.i.preload.s2a ; ; CODEGEN: polly.stmt.do.body: -; CODEGEN: %p_conv = fptosi float %U.load to i32 -; CODEGEN: %p_add = add nsw i32 %0, %p_conv +; CODEGEN: %p_add = add nsw i32 %U.load[[i]], %p_conv ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"