Index: include/polly/CodeGen/PerfMonitor.h =================================================================== --- include/polly/CodeGen/PerfMonitor.h +++ include/polly/CodeGen/PerfMonitor.h @@ -71,11 +71,6 @@ /// The value of the cycle counter at the beginning of the last scop. llvm::Value *CyclesInScopStartPtr; - /// A memory location which serves as argument of the RDTSCP function. - /// - /// The value written to this location is currently not used. - llvm::Value *RDTSCPWriteLocation; - /// A global variable, that keeps track if the performance monitor /// initialization has already been run. llvm::Value *AlreadyInitializedPtr; @@ -106,7 +101,7 @@ /// this scop runs. void addScopCounter(); - /// Get a reference to the intrinsic "i64 @llvm.x86.rdtscp(i8*)". + /// Get a reference to the intrinsic "{ i64, i32 } @llvm.x86.rdtscp()". /// /// The rdtscp function returns the current value of the processor's /// time-stamp counter as well as the current CPU identifier. On modern x86 Index: lib/CodeGen/PerfMonitor.cpp =================================================================== --- lib/CodeGen/PerfMonitor.cpp +++ lib/CodeGen/PerfMonitor.cpp @@ -113,9 +113,6 @@ TryRegisterGlobal(M, "__polly_perf_cycles_in_scop_start", Builder.getInt64(0), &CyclesInScopStartPtr); - - TryRegisterGlobal(M, "__polly_perf_write_loation", Builder.getInt32(0), - &RDTSCPWriteLocation); } static const char *InitFunctionName = "__polly_perf_init"; @@ -142,9 +139,9 @@ // Measure current cycles and compute final timings. Function *RDTSCPFn = getRDTSCP(); - Value *CurrentCycles = Builder.CreateCall( - RDTSCPFn, - Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy())); + + Value *CurrentCycles = + Builder.CreateExtractValue(Builder.CreateCall(RDTSCPFn), {0}); Value *CyclesStart = Builder.CreateLoad(CyclesTotalStartPtr, true); Value *CyclesTotal = Builder.CreateSub(CurrentCycles, CyclesStart); Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true); @@ -255,9 +252,8 @@ if (Supported) { // Read the currently cycle counter and store the result for later. Function *RDTSCPFn = getRDTSCP(); - Value *CurrentCycles = Builder.CreateCall( - RDTSCPFn, - Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy())); + Value *CurrentCycles = + Builder.CreateExtractValue(Builder.CreateCall(RDTSCPFn), {0}); Builder.CreateStore(CurrentCycles, CyclesTotalStartPtr, true); } Builder.CreateRetVoid(); @@ -271,9 +267,8 @@ Builder.SetInsertPoint(InsertBefore); Function *RDTSCPFn = getRDTSCP(); - Value *CurrentCycles = Builder.CreateCall( - RDTSCPFn, - Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy())); + Value *CurrentCycles = + Builder.CreateExtractValue(Builder.CreateCall(RDTSCPFn), {0}); Builder.CreateStore(CurrentCycles, CyclesInScopStartPtr, true); } @@ -284,9 +279,8 @@ Builder.SetInsertPoint(InsertBefore); Function *RDTSCPFn = getRDTSCP(); LoadInst *CyclesStart = Builder.CreateLoad(CyclesInScopStartPtr, true); - Value *CurrentCycles = Builder.CreateCall( - RDTSCPFn, - Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy())); + Value *CurrentCycles = + Builder.CreateExtractValue(Builder.CreateCall(RDTSCPFn), {0}); Value *CyclesInScop = Builder.CreateSub(CurrentCycles, CyclesStart); Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true); CyclesInScops = Builder.CreateAdd(CyclesInScops, CyclesInScop); Index: test/Isl/CodeGen/perf_monitoring.ll =================================================================== --- test/Isl/CodeGen/perf_monitoring.ll +++ test/Isl/CodeGen/perf_monitoring.ll @@ -36,35 +36,37 @@ ; CHECK-NEXT: @__polly_perf_initialized = weak thread_local(initialexec) constant i1 false ; CHECK-NEXT: @__polly_perf_cycles_in_scops = weak thread_local(initialexec) constant i64 0 ; CHECK-NEXT: @__polly_perf_cycles_in_scop_start = weak thread_local(initialexec) constant i64 0 -; CHECK-NEXT: @__polly_perf_write_loation = weak thread_local(initialexec) constant i32 0 ; CHECK: polly.split_new_and_old: ; preds = %entry -; CHECK-NEXT: %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*)) -; CHECK-NEXT: store volatile i64 %0, i64* @__polly_perf_cycles_in_scop_start +; CHECK-NEXT: %0 = call { i64, i32 } @llvm.x86.rdtscp() +; CHECK-NEXT: %1 = extractvalue { i64, i32 } %0, 0 +; CHECK-NEXT: store volatile i64 %1, i64* @__polly_perf_cycles_in_scop_start ; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting -; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start -; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*)) -; CHECK-NEXT: %7 = sub i64 %6, %5 -; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops -; CHECK-NEXT: %9 = add i64 %8, %7 -; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops +; CHECK-NEXT: %6 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start +; CHECK-NEXT: %7 = call { i64, i32 } @llvm.x86.rdtscp() +; CHECK-NEXT: %8 = extractvalue { i64, i32 } %7, 0 +; CHECK-NEXT: %9 = sub i64 %8, %6 +; CHECK-NEXT: %10 = load volatile i64, i64* @__polly_perf_cycles_in_scops +; CHECK-NEXT: %11 = add i64 %10, %9 +; CHECK-NEXT: store volatile i64 %11, i64* @__polly_perf_cycles_in_scops ; CHECK: define weak_odr void @__polly_perf_final() { ; CHECK-NEXT: start: -; CHECK-NEXT: %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*)) -; CHECK-NEXT: %1 = load volatile i64, i64* @__polly_perf_cycles_total_start -; CHECK-NEXT: %2 = sub i64 %0, %1 -; CHECK-NEXT: %3 = load volatile i64, i64* @__polly_perf_cycles_in_scops -; CHECK-NEXT: %4 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @1, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @0, i32 0, i32 0)) -; CHECK-NEXT: %5 = call i32 @fflush(i8* null) -; CHECK-NEXT: %6 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @3, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @2, i32 0, i32 0)) -; CHECK-NEXT: %7 = call i32 @fflush(i8* null) -; CHECK-NEXT: %8 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @6, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @4, i32 0, i32 0), i64 %2, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @5, i32 0, i32 0)) -; CHECK-NEXT: %9 = call i32 @fflush(i8* null) -; CHECK-NEXT: %10 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0)) -; CHECK-NEXT: %11 = call i32 @fflush(i8* null) +; CHECK-NEXT: %0 = call { i64, i32 } @llvm.x86.rdtscp() +; CHECK-NEXT: %1 = extractvalue { i64, i32 } %0, 0 +; CHECK-NEXT: %2 = load volatile i64, i64* @__polly_perf_cycles_total_start +; CHECK-NEXT: %3 = sub i64 %1, %2 +; CHECK-NEXT: %4 = load volatile i64, i64* @__polly_perf_cycles_in_scops +; CHECK-NEXT: %5 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @1, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @0, i32 0, i32 0)) +; CHECK-NEXT: %6 = call i32 @fflush(i8* null) +; CHECK-NEXT: %7 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @3, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @2, i32 0, i32 0)) +; CHECK-NEXT: %8 = call i32 @fflush(i8* null) +; CHECK-NEXT: %9 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @6, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @4, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @5, i32 0, i32 0)) +; CHECK-NEXT: %10 = call i32 @fflush(i8* null) +; CHECK-NEXT: %11 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %4, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0)) +; CHECK-NEXT: %12 = call i32 @fflush(i8* null) ; CHECK: define weak_odr void @__polly_perf_init() { @@ -78,7 +80,8 @@ ; CHECK: initbb: ; preds = %start ; CHECK-NEXT: store i1 true, i1* @__polly_perf_initialized ; CHECK-NEXT: %1 = call i32 @atexit(i8* bitcast (void ()* @__polly_perf_final to i8*)) -; CHECK-NEXT: %2 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*)) -; CHECK-NEXT: store volatile i64 %2, i64* @__polly_perf_cycles_total_start +; CHECK-NEXT: %2 = call { i64, i32 } @llvm.x86.rdtscp() +; CHECK-NEXT: %3 = extractvalue { i64, i32 } %2, 0 +; CHECK-NEXT: store volatile i64 %3, i64* @__polly_perf_cycles_total_start ; CHECK-NEXT: ret void ; CHECK-NEXT: } Index: test/Isl/CodeGen/perf_monitoring_cycles_per_scop.ll =================================================================== --- test/Isl/CodeGen/perf_monitoring_cycles_per_scop.ll +++ test/Isl/CodeGen/perf_monitoring_cycles_per_scop.ll @@ -65,11 +65,11 @@ ; @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles" = weak thread_local(initialexec) constant i64 0 ; Bumping up number of cycles in f -; CHECK: %10 = load volatile i64, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_cycles" -; CHECK-NEXT: %11 = add i64 %10, %7 -; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_cycles" +; CHECK: %12 = load volatile i64, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_cycles" +; CHECK-NEXT: %13 = add i64 %12, %9 +; CHECK-NEXT: store volatile i64 %13, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_cycles" ; Bumping up number of cycles in g -; CHECK: %10 = load volatile i64, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles" -; CHECK-NEXT: %11 = add i64 %10, %7 -; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles" +; CHECK: %12 = load volatile i64, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles" +; CHECK-NEXT: %13 = add i64 %12, %9 +; CHECK-NEXT: store volatile i64 %13, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles" Index: test/Isl/CodeGen/perf_monitoring_trip_counts_per_scop.ll =================================================================== --- test/Isl/CodeGen/perf_monitoring_trip_counts_per_scop.ll +++ test/Isl/CodeGen/perf_monitoring_trip_counts_per_scop.ll @@ -65,11 +65,11 @@ ; CHECK: @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count" = weak thread_local(initialexec) constant i64 0 ; Bumping up number of cycles in f -; CHECK: %12 = load volatile i64, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_trip_count" -; CHECK-NEXT: %13 = add i64 %12, 1 -; CHECK-NEXT: store volatile i64 %13, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_trip_count" +; CHECK: %14 = load volatile i64, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_trip_count" +; CHECK-NEXT: %15 = add i64 %14, 1 +; CHECK-NEXT: store volatile i64 %15, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_trip_count" ; Bumping up number of cycles in g -; CHECK: %12 = load volatile i64, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count" -; CHECK-NEXT: %13 = add i64 %12, 1 -; CHECK-NEXT: store volatile i64 %13, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count" +; CHECK: %14 = load volatile i64, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count" +; CHECK-NEXT: %15 = add i64 %14, 1 +; CHECK-NEXT: store volatile i64 %15, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count"