; ModuleID = 'kernel_0'
;
; Device module holding a single NVPTX kernel, @kernel_0.  Block and value
; names carry the `polly.` prefix, so this is presumably Polly-generated GPU
; code; the attached debug locations point at a DISubprogram named
; "kernel_gemm" (NOTE(review): presumably the host function the loop nest was
; extracted from -- confirm against the producer).
;
; With
;   row = 32*ctaid.y + tid.y + 8192*%polly.indvar5 + 16*<row-step indvar>
;   col = 32*ctaid.x + tid.x + 8192*%polly.indvar
;   k   = 32*%polly.indvar12 + %polly.indvar40
; the kernel performs, on i64 elements:
;   MemRef0[row*p_3 + col] *= %0          (first k-tile only, or when p_1 <= 0)
;   MemRef0[row*p_3 + col] += MemRef1[k*p_2 + col] * %1 * MemRef2[row*p_1 + k]
; where %0 and %1 are the two trailing unnamed i64 parameters.
source_filename = "kernel_0"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; Parameters:
;   %MemRef0, %MemRef1, %MemRef2  address-space-1 byte pointers, accessed as i64 arrays
;   %p_0, %p_1, %p_2              loop bounds (row, k and column extents respectively)
;   %p_3                          row stride used when indexing %MemRef0
;   %0, %1                        unnamed i64 scalars multiplied into the result
define ptx_kernel void @kernel_0(i8 addrspace(1)* %MemRef0, i8 addrspace(1)* %MemRef1, i8 addrspace(1)* %MemRef2, i64 %p_0, i64 %p_1, i64 %p_2, i64 %p_3, i64, i64) #0 {
entry:
  ; Block and thread indices, widened to i64.
  %2 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %b0 = zext i32 %2 to i64
  %3 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  %b1 = zext i32 %3 to i64
  %4 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %t0 = zext i32 %4 to i64
  %5 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  %t1 = zext i32 %5 to i64
  ; Upper bound of the column-tile loop: (p_2 - 32*b0 - 1) / 8192, unsigned.
  %6 = mul nsw i64 32, %b0
  %7 = sub nsw i64 %p_2, %6
  %8 = sub nsw i64 %7, 1
  %pexp.p_div_q = udiv i64 %8, 8192
  br label %polly.loop_preheader

polly.loop_exit:                                  ; preds = %polly.loop_exit4
  ret void

; Column-tile loop: %polly.indvar runs 0 .. %pexp.p_div_q (inclusive), unguarded.
polly.loop_header:                                ; preds = %polly.loop_exit4, %polly.loop_preheader
  %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_exit4 ]
  ; Upper bound of the row-tile loop: (p_0 - 32*b1 - 1) / 8192, unsigned.
  %9 = mul nsw i64 32, %b1
  %10 = sub nsw i64 %p_0, %9
  %11 = sub nsw i64 %10, 1
  %pexp.p_div_q1 = udiv i64 %11, 8192
  br label %polly.loop_preheader3

polly.loop_exit4:                                 ; preds = %polly.merge61
  %polly.indvar_next = add nsw i64 %polly.indvar, 1
  %polly.adjust_ub = sub i64 %pexp.p_div_q, 1
  %polly.loop_cond = icmp sle i64 %polly.indvar, %polly.adjust_ub
  br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit

polly.loop_preheader:                             ; preds = %entry
  ; Three identical max(p_0, 0) + 1 computations; none of the results is used
  ; later in this function.
  %12 = icmp sgt i64 %p_0, 0
  %smax = select i1 %12, i64 %p_0, i64 0
  %13 = add i64 %smax, 1
  %14 = icmp sgt i64 %p_0, 0
  %smax58 = select i1 %14, i64 %p_0, i64 0
  %15 = add i64 %smax58, 1
  %16 = icmp sgt i64 %p_0, 0
  %smax89 = select i1 %16, i64 %p_0, i64 0
  %17 = add i64 %smax89, 1
  br label %polly.loop_header

; Row-tile loop: %polly.indvar5 runs 0 .. %pexp.p_div_q1 (inclusive).
polly.loop_header2:                               ; preds = %polly.merge61, %polly.loop_preheader3
  %polly.indvar5 = phi i64 [ 0, %polly.loop_preheader3 ], [ %polly.indvar_next6, %polly.merge61 ]
  ; Upper bound of the k-tile loop: floor((p_1 - 1) / 32).
  %18 = sub nsw i64 %p_1, 1
  %polly.fdiv_q.shr = ashr i64 %18, 5
  br label %polly.loop_if

polly.loop_exit11:                                ; preds = %polly.merge, %polly.loop_if
  br label %polly.cond60

; When p_1 <= 0 the k-tile loop above ran zero times, so the scaling of
; MemRef0 by %0 is performed on its own (polly.then62 onwards).
polly.cond60:                                     ; preds = %polly.loop_exit11
  %19 = icmp sle i64 %p_1, 0
  br i1 %19, label %polly.then62, label %polly.else63

polly.merge61:                                    ; preds = %polly.else63, %polly.merge65
  %polly.indvar_next6 = add nsw i64 %polly.indvar5, 1
  %polly.adjust_ub7 = sub i64 %pexp.p_div_q1, 1
  %polly.loop_cond8 = icmp sle i64 %polly.indvar5, %polly.adjust_ub7
  br i1 %polly.loop_cond8, label %polly.loop_header2, label %polly.loop_exit4

polly.loop_preheader3:                            ; preds = %polly.loop_header
  br label %polly.loop_header2

polly.loop_if:                                    ; preds = %polly.loop_header2
  %polly.loop_guard = icmp sle i64 0, %polly.fdiv_q.shr
  br i1 %polly.loop_guard, label %polly.loop_preheader10, label %polly.loop_exit11

; k-tile loop: %polly.indvar12 runs 0 .. %polly.fdiv_q.shr (inclusive).
polly.loop_header9:                               ; preds = %polly.merge, %polly.loop_preheader10
  %polly.indvar12 = phi i64 [ 0, %polly.loop_preheader10 ], [ %polly.indvar_next13, %polly.merge ]
  br label %polly.cond

; Column bound check: p_2 >= col + 1.
polly.cond:                                       ; preds = %polly.loop_header9
  %20 = mul nsw i64 32, %b0
  %21 = add nsw i64 %20, %t0
  %22 = mul nsw i64 8192, %polly.indvar
  %23 = add nsw i64 %21, %22
  %24 = add nsw i64 %23, 1
  %25 = icmp sge i64 %p_2, %24
  br i1 %25, label %polly.then, label %polly.else

; Both arms of the column check rejoin here, so the barrier is executed once
; per k-tile iteration regardless of the check's outcome.
polly.merge:                                      ; preds = %polly.else, %polly.loop_exit20
  call void @llvm.nvvm.barrier0()
  %polly.indvar_next13 = add nsw i64 %polly.indvar12, 1
  %polly.adjust_ub14 = sub i64 %polly.fdiv_q.shr, 1
  %polly.loop_cond15 = icmp sle i64 %polly.indvar12, %polly.adjust_ub14
  br i1 %polly.loop_cond15, label %polly.loop_header9, label %polly.loop_exit11

polly.loop_preheader10:                           ; preds = %polly.loop_if
  br label %polly.loop_header9

; Upper bound of the row-step loop:
;   %33 = min(1, -2*b1 - 512*indvar5 + floor((p_0 - t1 - 1) / 16))
polly.then:                                       ; preds = %polly.cond
  %26 = mul nsw i64 -2, %b1
  %27 = mul nsw i64 512, %polly.indvar5
  %28 = sub nsw i64 %26, %27
  %29 = sub nsw i64 %p_0, %t1
  %30 = sub nsw i64 %29, 1
  %polly.fdiv_q.shr16 = ashr i64 %30, 4
  %31 = add nsw i64 %28, %polly.fdiv_q.shr16
  %32 = icmp slt i64 1, %31
  %33 = select i1 %32, i64 1, i64 %31
  br label %polly.loop_if17

polly.loop_exit20:                                ; preds = %polly.loop_exit38, %polly.loop_if17
  br label %polly.merge

polly.else:                                       ; preds = %polly.cond
  br label %polly.merge

polly.loop_if17:                                  ; preds = %polly.then
  %polly.loop_guard21 = icmp sle i64 0, %33
  br i1 %polly.loop_guard21, label %polly.loop_preheader19, label %polly.loop_exit20

; Row-step loop: %polly.indvar22 runs 0 .. %33 (inclusive); rows advance by 16.
polly.loop_header18:                              ; preds = %polly.loop_exit38, %polly.loop_preheader19
  %polly.indvar22 = phi i64 [ 0, %polly.loop_preheader19 ], [ %polly.indvar_next23, %polly.loop_exit38 ]
  br label %polly.cond26

; The scaling statement (polly.stmt.if7) only runs in the first k-tile.
polly.cond26:                                     ; preds = %polly.loop_header18
  %34 = icmp eq i64 %polly.indvar12, 0
  br i1 %34, label %polly.then28, label %polly.else29

; Upper bound of the intra-tile k loop: %39 = min(31, p_1 - 32*indvar12 - 1).
polly.merge27:                                    ; preds = %polly.else29, %polly.stmt.if7
  %35 = mul nsw i64 32, %polly.indvar12
  %36 = sub nsw i64 %p_1, %35
  %37 = sub nsw i64 %36, 1
  %38 = icmp slt i64 31, %37
  %39 = select i1 %38, i64 31, i64 %37
  br label %polly.loop_if35

polly.loop_exit38:                                ; preds = %polly.stmt.if9, %polly.loop_if35
  %polly.indvar_next23 = add nsw i64 %polly.indvar22, 1
  %polly.adjust_ub24 = sub i64 %33, 1
  %polly.loop_cond25 = icmp sle i64 %polly.indvar22, %polly.adjust_ub24
  br i1 %polly.loop_cond25, label %polly.loop_header18, label %polly.loop_exit20

polly.loop_preheader19:                           ; preds = %polly.loop_if17
  br label %polly.loop_header18

polly.then28:                                     ; preds = %polly.cond26
  ; col (%43) and row (%49) are computed here but not used; polly.stmt.if7
  ; recomputes them for each access.
  %40 = mul nsw i64 32, %b0
  %41 = add nsw i64 %40, %t0
  %42 = mul nsw i64 8192, %polly.indvar
  %43 = add nsw i64 %41, %42
  %44 = mul nsw i64 32, %b1
  %45 = add nsw i64 %44, %t1
  %46 = mul nsw i64 8192, %polly.indvar5
  %47 = add nsw i64 %45, %46
  %48 = mul nsw i64 16, %polly.indvar22
  %49 = add nsw i64 %47, %48
  br label %polly.stmt.if7

; MemRef0[row*p_3 + col] = MemRef0[row*p_3 + col] * %0
polly.stmt.if7:                                   ; preds = %polly.then28
  %polly.access.cast.MemRef0 = bitcast i8 addrspace(1)* %MemRef0 to i64 addrspace(1)*
  %50 = mul nsw i64 32, %b1
  %51 = add nsw i64 %50, %t1
  %52 = mul nsw i64 8192, %polly.indvar5
  %53 = add nsw i64 %51, %52
  %54 = mul nsw i64 16, %polly.indvar22
  %55 = add nsw i64 %53, %54
  %polly.access.mul.MemRef0 = mul nsw i64 %55, %p_3
  %56 = mul nsw i64 32, %b0
  %57 = add nsw i64 %56, %t0
  %58 = mul nsw i64 8192, %polly.indvar
  %59 = add nsw i64 %57, %58
  %polly.access.add.MemRef0 = add nsw i64 %polly.access.mul.MemRef0, %59
  %polly.access.MemRef0 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef0, i64 %polly.access.add.MemRef0
  %_p_scalar_ = load i64, i64 addrspace(1)* %polly.access.MemRef0, align 8
  %p_ = mul i64 %_p_scalar_, %0, !dbg !1
  %polly.access.cast.MemRef030 = bitcast i8 addrspace(1)* %MemRef0 to i64 addrspace(1)*
  %60 = mul nsw i64 32, %b1
  %61 = add nsw i64 %60, %t1
  %62 = mul nsw i64 8192, %polly.indvar5
  %63 = add nsw i64 %61, %62
  %64 = mul nsw i64 16, %polly.indvar22
  %65 = add nsw i64 %63, %64
  %polly.access.mul.MemRef031 = mul nsw i64 %65, %p_3
  %66 = mul nsw i64 32, %b0
  %67 = add nsw i64 %66, %t0
  %68 = mul nsw i64 8192, %polly.indvar
  %69 = add nsw i64 %67, %68
  %polly.access.add.MemRef032 = add nsw i64 %polly.access.mul.MemRef031, %69
  %polly.access.MemRef033 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef030, i64 %polly.access.add.MemRef032
  store i64 %p_, i64 addrspace(1)* %polly.access.MemRef033, align 8
  br label %polly.merge27

polly.else29:                                     ; preds = %polly.cond26
  br label %polly.merge27

polly.loop_if35:                                  ; preds = %polly.merge27
  %polly.loop_guard39 = icmp sle i64 0, %39
  br i1 %polly.loop_guard39, label %polly.loop_preheader37, label %polly.loop_exit38

; Intra-tile k loop: %polly.indvar40 runs 0 .. %39 (inclusive).
polly.loop_header36:                              ; preds = %polly.stmt.if9, %polly.loop_preheader37
  %polly.indvar40 = phi i64 [ 0, %polly.loop_preheader37 ], [ %polly.indvar_next41, %polly.stmt.if9 ]
  ; col (%73), k (%75) and row (%81) are computed here but not used;
  ; polly.stmt.if9 recomputes them for each access.
  %70 = mul nsw i64 32, %b0
  %71 = add nsw i64 %70, %t0
  %72 = mul nsw i64 8192, %polly.indvar
  %73 = add nsw i64 %71, %72
  %74 = mul nsw i64 32, %polly.indvar12
  %75 = add nsw i64 %74, %polly.indvar40
  %76 = mul nsw i64 32, %b1
  %77 = add nsw i64 %76, %t1
  %78 = mul nsw i64 8192, %polly.indvar5
  %79 = add nsw i64 %77, %78
  %80 = mul nsw i64 16, %polly.indvar22
  %81 = add nsw i64 %79, %80
  br label %polly.stmt.if9

; MemRef0[row*p_3 + col] =
;     MemRef1[k*p_2 + col] * %1 * MemRef2[row*p_1 + k] + MemRef0[row*p_3 + col]
polly.stmt.if9:                                   ; preds = %polly.loop_header36
  %polly.access.cast.MemRef044 = bitcast i8 addrspace(1)* %MemRef0 to i64 addrspace(1)*
  %82 = mul nsw i64 32, %b1
  %83 = add nsw i64 %82, %t1
  %84 = mul nsw i64 8192, %polly.indvar5
  %85 = add nsw i64 %83, %84
  %86 = mul nsw i64 16, %polly.indvar22
  %87 = add nsw i64 %85, %86
  %polly.access.mul.MemRef045 = mul nsw i64 %87, %p_3
  %88 = mul nsw i64 32, %b0
  %89 = add nsw i64 %88, %t0
  %90 = mul nsw i64 8192, %polly.indvar
  %91 = add nsw i64 %89, %90
  %polly.access.add.MemRef046 = add nsw i64 %polly.access.mul.MemRef045, %91
  %polly.access.MemRef047 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef044, i64 %polly.access.add.MemRef046
  %_p_scalar_48 = load i64, i64 addrspace(1)* %polly.access.MemRef047, align 8
  %polly.access.cast.MemRef1 = bitcast i8 addrspace(1)* %MemRef1 to i64 addrspace(1)*
  %92 = mul nsw i64 32, %polly.indvar12
  %93 = add nsw i64 %92, %polly.indvar40
  %polly.access.mul.MemRef1 = mul nsw i64 %93, %p_2
  %94 = mul nsw i64 32, %b0
  %95 = add nsw i64 %94, %t0
  %96 = mul nsw i64 8192, %polly.indvar
  %97 = add nsw i64 %95, %96
  %polly.access.add.MemRef1 = add nsw i64 %polly.access.mul.MemRef1, %97
  %polly.access.MemRef1 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef1, i64 %polly.access.add.MemRef1
  %_p_scalar_49 = load i64, i64 addrspace(1)* %polly.access.MemRef1, align 8
  %p_50 = mul i64 %_p_scalar_49, %1, !dbg !27
  %polly.access.cast.MemRef2 = bitcast i8 addrspace(1)* %MemRef2 to i64 addrspace(1)*
  %98 = mul nsw i64 32, %b1
  %99 = add nsw i64 %98, %t1
  %100 = mul nsw i64 8192, %polly.indvar5
  %101 = add nsw i64 %99, %100
  %102 = mul nsw i64 16, %polly.indvar22
  %103 = add nsw i64 %101, %102
  %polly.access.mul.MemRef2 = mul nsw i64 %103, %p_1
  %104 = mul nsw i64 32, %polly.indvar12
  %105 = add nsw i64 %104, %polly.indvar40
  %polly.access.add.MemRef2 = add nsw i64 %polly.access.mul.MemRef2, %105
  %polly.access.MemRef2 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef2, i64 %polly.access.add.MemRef2
  %_p_scalar_51 = load i64, i64 addrspace(1)* %polly.access.MemRef2, align 8
  %p_52 = mul i64 %p_50, %_p_scalar_51, !dbg !27
  %p_53 = add i64 %p_52, %_p_scalar_48, !dbg !27
  %polly.access.cast.MemRef054 = bitcast i8 addrspace(1)* %MemRef0 to i64 addrspace(1)*
  %106 = mul nsw i64 32, %b1
  %107 = add nsw i64 %106, %t1
  %108 = mul nsw i64 8192, %polly.indvar5
  %109 = add nsw i64 %107, %108
  %110 = mul nsw i64 16, %polly.indvar22
  %111 = add nsw i64 %109, %110
  %polly.access.mul.MemRef055 = mul nsw i64 %111, %p_3
  %112 = mul nsw i64 32, %b0
  %113 = add nsw i64 %112, %t0
  %114 = mul nsw i64 8192, %polly.indvar
  %115 = add nsw i64 %113, %114
  %polly.access.add.MemRef056 = add nsw i64 %polly.access.mul.MemRef055, %115
  %polly.access.MemRef057 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef054, i64 %polly.access.add.MemRef056
  store i64 %p_53, i64 addrspace(1)* %polly.access.MemRef057, align 8
  %polly.indvar_next41 = add nsw i64 %polly.indvar40, 1
  %polly.adjust_ub42 = sub i64 %39, 1
  %polly.loop_cond43 = icmp sle i64 %polly.indvar40, %polly.adjust_ub42
  br i1 %polly.loop_cond43, label %polly.loop_header36, label %polly.loop_exit38

polly.loop_preheader37:                           ; preds = %polly.loop_if35
  br label %polly.loop_header36

polly.then62:                                     ; preds = %polly.cond60
  br label %polly.cond64

; Column bound check for the scale-only path: p_2 >= col + 1.
polly.cond64:                                     ; preds = %polly.then62
  %116 = mul nsw i64 32, %b0
  %117 = add nsw i64 %116, %t0
  %118 = mul nsw i64 8192, %polly.indvar
  %119 = add nsw i64 %117, %118
  %120 = add nsw i64 %119, 1
  %121 = icmp sge i64 %p_2, %120
  br i1 %121, label %polly.then66, label %polly.else67

polly.merge65:                                    ; preds = %polly.else67, %polly.loop_exit72
  call void @llvm.nvvm.barrier0()
  br label %polly.merge61

polly.else63:                                     ; preds = %polly.cond60
  br label %polly.merge61

; Upper bound of the scale-only row-step loop (same expression as %33):
;   %129 = min(1, -2*b1 - 512*indvar5 + floor((p_0 - t1 - 1) / 16))
polly.then66:                                     ; preds = %polly.cond64
  %122 = mul nsw i64 -2, %b1
  %123 = mul nsw i64 512, %polly.indvar5
  %124 = sub nsw i64 %122, %123
  %125 = sub nsw i64 %p_0, %t1
  %126 = sub nsw i64 %125, 1
  %polly.fdiv_q.shr68 = ashr i64 %126, 4
  %127 = add nsw i64 %124, %polly.fdiv_q.shr68
  %128 = icmp slt i64 1, %127
  %129 = select i1 %128, i64 1, i64 %127
  br label %polly.loop_if69

polly.loop_exit72:                                ; preds = %polly.stmt.if778, %polly.loop_if69
  br label %polly.merge65

polly.else67:                                     ; preds = %polly.cond64
  br label %polly.merge65

polly.loop_if69:                                  ; preds = %polly.then66
  %polly.loop_guard73 = icmp sle i64 0, %129
  br i1 %polly.loop_guard73, label %polly.loop_preheader71, label %polly.loop_exit72

; Scale-only row-step loop: %polly.indvar74 runs 0 .. %129 (inclusive).
polly.loop_header70:                              ; preds = %polly.stmt.if778, %polly.loop_preheader71
  %polly.indvar74 = phi i64 [ 0, %polly.loop_preheader71 ], [ %polly.indvar_next75, %polly.stmt.if778 ]
  ; col (%133) and row (%139) are computed here but not used;
  ; polly.stmt.if778 recomputes them for each access.
  %130 = mul nsw i64 32, %b0
  %131 = add nsw i64 %130, %t0
  %132 = mul nsw i64 8192, %polly.indvar
  %133 = add nsw i64 %131, %132
  %134 = mul nsw i64 32, %b1
  %135 = add nsw i64 %134, %t1
  %136 = mul nsw i64 8192, %polly.indvar5
  %137 = add nsw i64 %135, %136
  %138 = mul nsw i64 16, %polly.indvar74
  %139 = add nsw i64 %137, %138
  br label %polly.stmt.if778

; MemRef0[row*p_3 + col] = MemRef0[row*p_3 + col] * %0
polly.stmt.if778:                                 ; preds = %polly.loop_header70
  %polly.access.cast.MemRef079 = bitcast i8 addrspace(1)* %MemRef0 to i64 addrspace(1)*
  %140 = mul nsw i64 32, %b1
  %141 = add nsw i64 %140, %t1
  %142 = mul nsw i64 8192, %polly.indvar5
  %143 = add nsw i64 %141, %142
  %144 = mul nsw i64 16, %polly.indvar74
  %145 = add nsw i64 %143, %144
  %polly.access.mul.MemRef080 = mul nsw i64 %145, %p_3
  %146 = mul nsw i64 32, %b0
  %147 = add nsw i64 %146, %t0
  %148 = mul nsw i64 8192, %polly.indvar
  %149 = add nsw i64 %147, %148
  %polly.access.add.MemRef081 = add nsw i64 %polly.access.mul.MemRef080, %149
  %polly.access.MemRef082 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef079, i64 %polly.access.add.MemRef081
  %_p_scalar_83 = load i64, i64 addrspace(1)* %polly.access.MemRef082, align 8
  %p_84 = mul i64 %_p_scalar_83, %0, !dbg !1
  %polly.access.cast.MemRef085 = bitcast i8 addrspace(1)* %MemRef0 to i64 addrspace(1)*
  %150 = mul nsw i64 32, %b1
  %151 = add nsw i64 %150, %t1
  %152 = mul nsw i64 8192, %polly.indvar5
  %153 = add nsw i64 %151, %152
  %154 = mul nsw i64 16, %polly.indvar74
  %155 = add nsw i64 %153, %154
  %polly.access.mul.MemRef086 = mul nsw i64 %155, %p_3
  %156 = mul nsw i64 32, %b0
  %157 = add nsw i64 %156, %t0
  %158 = mul nsw i64 8192, %polly.indvar
  %159 = add nsw i64 %157, %158
  %polly.access.add.MemRef087 = add nsw i64 %polly.access.mul.MemRef086, %159
  %polly.access.MemRef088 = getelementptr i64, i64 addrspace(1)* %polly.access.cast.MemRef085, i64 %polly.access.add.MemRef087
  store i64 %p_84, i64 addrspace(1)* %polly.access.MemRef088, align 8
  %polly.indvar_next75 = add nsw i64 %polly.indvar74, 1
  %polly.adjust_ub76 = sub i64 %129, 1
  %polly.loop_cond77 = icmp sle i64 %polly.indvar74, %polly.adjust_ub76
  br i1 %polly.loop_cond77, label %polly.loop_header70, label %polly.loop_exit72

polly.loop_preheader71:                           ; preds = %polly.loop_if69
  br label %polly.loop_header70
}

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #1

; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2

attributes #0 = { "polly.skip.fn" }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }

; Per-kernel launch limits: at most 32 x 16 x 1 threads per block.
!nvvm.annotations = !{!0}
; The !dbg attachments inside @kernel_0 reach compile unit !11 through
; subprogram !2.  The verifier requires every reachable DICompileUnit to be
; listed in !llvm.dbg.cu, and debug metadata is only honoured when the module
; declares its "Debug Info Version"; without these two entries the debug info
; is rejected or dropped.
!llvm.dbg.cu = !{!11}
!llvm.module.flags = !{!28}

!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*, i64, i64, i64, i64, i64, i64)* @kernel_0, !"maxntidx", i32 32, !"maxntidy", i32 16, !"maxntidz", i32 1}
!1 = !DILocation(line: 6, scope: !2)
!2 = distinct !DISubprogram(name: "kernel_gemm", linkageName: "julia_kernel_gemm_64340", scope: null, file: !3, type: !4, isLocal: false, isDefinition: true, isOptimized: true, unit: !11, variables: !13)
!3 = !DIFile(filename: "REPL[1]", directory: ".")
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !6, !7, !7, !7}
!6 = !DIBasicType(name: "Int64", size: 64, encoding: DW_ATE_unsigned)
!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 64)
!8 = !DICompositeType(tag: DW_TAG_structure_type, name: "jl_value_t", file: !9, line: 71, align: 64, elements: !10)
!9 = !DIFile(filename: "julia.h", directory: "")
!10 = !{!7}
!11 = distinct !DICompileUnit(language: DW_LANG_C89, file: !3, producer: "julia", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !12)
!12 = !{}
!13 = !{!14, !16, !17, !18, !19, !20, !21, !21, !22, !23, !24, !25, !26}
!14 = !DILocalVariable(name: "#self#", arg: 1, scope: !2, file: !3, line: 2, type: !15)
!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "#kernel_gemm", align: 8, elements: !12, runtimeLang: DW_LANG_Julia, identifier: "#kernel_gemm_64281")
!16 = !DILocalVariable(name: "alpha", arg: 2, scope: !2, file: !3, line: 2, type: !6)
!17 = !DILocalVariable(name: "beta", arg: 3, scope: !2, file: !3, line: 2, type: !6)
!18 = !DILocalVariable(name: "C", arg: 4, scope: !2, file: !3, line: 2, type: !7)
!19 = !DILocalVariable(name: "A", arg: 5, scope: !2, file: !3, line: 2, type: !7)
!20 = !DILocalVariable(name: "B", arg: 6, scope: !2, file: !3, line: 2, type: !7)
!21 = !DILocalVariable(name: "j", scope: !2, file: !3, line: 2, type: !6)
!22 = !DILocalVariable(name: "k", scope: !2, file: !3, line: 2, type: !6)
!23 = !DILocalVariable(name: "i", scope: !2, file: !3, line: 2, type: !6)
!24 = !DILocalVariable(name: "ni", scope: !2, file: !3, line: 2, type: !6)
!25 = !DILocalVariable(name: "nk", scope: !2, file: !3, line: 2, type: !6)
!26 = !DILocalVariable(name: "nj", scope: !2, file: !3, line: 2, type: !6)
!27 = !DILocation(line: 9, scope: !2)
!28 = !{i32 2, !"Debug Info Version", i32 3}