Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -152,8 +152,10 @@ for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; - if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA)) - return false; + if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA)) { + if (MemOp.mayStore() || InstToMove->mayStore()) + return false; + } } return true; } @@ -212,12 +214,19 @@ return E; if (MBBI->mayLoadOrStore() && - !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) { - // We fail condition #1, but we may still be able to satisfy condition - // #2. Add this instruction to the move list and then we will check - // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); - addDefsToList(*MBBI, DefsToMove); + !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) { + /* + RAW or WAR - cannot reorder + WAW - cannot reorder + RAR - safe to reorder + */ + if (MBBI->mayStore() || I->mayStore()) { + // We fail condition #1, but we may still be able to satisfy condition + // #2. Add this instruction to the move list and then we will check + // if condition #2 holds once we have selected the matching instruction. + InstsToMove.push_back(&*MBBI); + addDefsToList(*MBBI, DefsToMove); + } continue; } @@ -269,7 +278,9 @@ // it was safe to move I and also all the instruction in InstsToMove // down past this instruction. // FIXME: This is too conservative. - break; + if (!TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA) && + (I->mayStore() || MBBI->mayStore())) + break; } return E; } Index: test/CodeGen/AMDGPU/cgemm_loopunroll_ds_combine.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/cgemm_loopunroll_ds_combine.ll @@ -0,0 +1,533 @@ +; RUN: llc -O2 -mtriple amdgcn--amdhsa -mcpu=fiji -filetype=asm < %s | FileCheck -check-prefix=SI %s +; SI-LABEL: _ZZ26cgemm_NoTransAB_loopunrollRN2 +; SI-NOT: ds_read_b32 + +; ModuleID = 'fiji.opt.bc' +source_filename = "llvm-link" +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target triple = "amdgcn--amdhsa" + +; Function Attrs: nounwind +declare void @llvm.amdgcn.s.waitcnt(i32) #2 + +; Function Attrs: nounwind +declare void @llvm.amdgcn.s.dcache.wb() #2 + +; Function Attrs: convergent nounwind +declare void @llvm.amdgcn.s.barrier() #3 + +; Function Attrs: nounwind readnone +declare float @llvm.fabs.f32(float) #4 + +; Function Attrs: nounwind readnone +declare i1 @llvm.amdgcn.class.f32(float, i32) #4 + +; Function Attrs: nounwind readnone +declare i1 @llvm.amdgcn.class.f64(double, i32) #4 + +; Function Attrs: nounwind readnone +declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.x() #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workgroup.id.x() #4 + +; Function Attrs: nounwind readnone +declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.y() #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workgroup.id.y() #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.amdgcn.workgroup.id.z() #4 + +%"class.hc::short_vector::float_2.0" = type { float, float } + +@"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2" = internal unnamed_addr addrspace(3) global [16 x [16 x float]] undef, align 4 + +@"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3" = internal unnamed_addr addrspace(3) global [16 x [16 x float]] undef, align 4 + +@"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4" = internal unnamed_addr addrspace(3) global [16 x [16 x float]] undef, align 4 + +@"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5" = internal unnamed_addr addrspace(3) global [16 x [16 x float]] undef, align 4 + +define amdgpu_kernel void @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff"(i32, i32, %"class.hc::short_vector::float_2.0" addrspace(1)* nocapture readonly, i64, i32, i32, %"class.hc::short_vector::float_2.0" addrspace(1)* nocapture readonly, i64, i32, %"class.hc::short_vector::float_2.0" addrspace(1)* nocapture, i64, i32, float, float, float, float) local_unnamed_addr #0 align 2 { + %17 = tail call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #4 + %18 = tail call i32 @llvm.amdgcn.workitem.id.y() #4 + %19 = tail call i32 @llvm.amdgcn.workgroup.id.y() #4 + %20 = getelementptr inbounds i8, i8 addrspace(2)* %17, i64 6 + %21 = bitcast i8 addrspace(2)* %20 to i16 addrspace(2)* + %22 = load i16, i16 addrspace(2)* %21, align 2, !tbaa !232 + %23 = zext i16 %22 to i32 + %24 = mul i32 %23, %19 + %25 = add i32 %24, %18 + %26 = tail call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #4 + %27 = getelementptr inbounds i8, i8 addrspace(2)* %26, i64 8 + %28 = zext i32 %25 to i64 + %29 = bitcast i8 addrspace(2)* %27 to i64 addrspace(2)* + %30 = load i64, i64 addrspace(2)* %29, align 8, !tbaa !241 + %31 = add i64 %28, %30 + %32 = tail call i32 @llvm.amdgcn.workitem.id.x() #4 + %33 = tail call i32 @llvm.amdgcn.workgroup.id.x() #4 + %34 = getelementptr inbounds i8, i8 addrspace(2)* %17, i64 4 + %35 = bitcast i8 addrspace(2)* %34 to i16 addrspace(2)* + %36 = load i16, i16 addrspace(2)* %35, align 4, !tbaa !242 + %37 = zext i16 %36 to i32 + %38 = mul i32 %37, %33 + %39 = add i32 %38, %32 + %40 = zext i32 %39 to i64 + %41 = bitcast i8 addrspace(2)* %26 to i64 addrspace(2)* + %42 = load i64, i64 addrspace(2)* %41, align 8, !tbaa !241 + %conv.i.i.i = trunc i64 %31 to i32 + %mul.i = shl nsw i32 %19, 4 + %add.i = add nsw i32 %mul.i, %18 + %mul5.i = shl nsw i32 %33, 4 + %add8.i = add nsw i32 %mul5.i, %32 + %cmp1281.i = icmp sgt i32 %0, 0 + br i1 %cmp1281.i, label %.lr.ph.i, label %._crit_edge.i + +.lr.ph.i: ; preds = %16 + %conv781288.i = zext i32 %add8.i to i64 + %cmp15.i = icmp slt i32 %add.i, %1 + %mul16.i = mul nsw i32 %add.i, %4 + %conv1115.i = zext i32 %mul16.i to i64 + %add17.i = add i64 %conv1115.i, %3 + %43 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 %32 + %44 = bitcast float addrspace(3)* %43 to i32 addrspace(3)* + %45 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 %32 + %cmp70.i = icmp slt i32 %add8.i, %5 + %add77.i = add i64 %conv781288.i, %7 + %46 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 %18, i32 %32 + %47 = bitcast float addrspace(3)* %46 to i32 addrspace(3)* + %48 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 %18, i32 %32 + %49 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 0 + %50 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 0, i32 %32 + %51 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 0 + %52 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 0, i32 %32 + %53 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 1 + %54 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 1, i32 %32 + %55 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 1 + %56 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 1, i32 %32 + %57 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 2 + %58 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 2, i32 %32 + %59 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 2 + %60 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 2, i32 %32 + %61 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 3 + %62 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 3, i32 %32 + %63 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 3 + %64 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 3, i32 %32 + %65 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 4 + %66 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 4, i32 %32 + %67 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 4 + %68 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 4, i32 %32 + %69 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 5 + %70 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 5, i32 %32 + %71 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 5 + %72 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 5, i32 %32 + %73 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 6 + %74 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 6, i32 %32 + %75 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 6 + %76 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 6, i32 %32 + %77 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 7 + %78 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 7, i32 %32 + %79 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 7 + %80 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 7, i32 %32 + %81 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 8 + %82 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 8, i32 %32 + %83 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 8 + %84 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 8, i32 %32 + %85 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 9 + %86 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 9, i32 %32 + %87 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 9 + %88 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 9, i32 %32 + %89 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 10 + %90 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 10, i32 %32 + %91 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 10 + %92 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 10, i32 %32 + %93 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 11 + %94 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 11, i32 %32 + %95 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 11 + %96 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 11, i32 %32 + %97 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 12 + %98 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 12, i32 %32 + %99 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 12 + %100 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 12, i32 %32 + %101 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 13 + %102 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 13, i32 %32 + %103 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 13 + %104 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 13, i32 %32 + %105 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 14 + %106 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 14, i32 %32 + %107 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 14 + %108 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 14, i32 %32 + %109 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Bsreal.2", i32 0, i32 %18, i32 15 + %110 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE6Asreal.4", i32 0, i32 15, i32 %32 + %111 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Bsimg.3", i32 0, i32 %18, i32 15 + %112 = getelementptr [16 x [16 x float]], [16 x [16 x float]] addrspace(3)* @"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_EN3$_019__cxxamp_trampolineEiiS4_liiS4_liS4_liffff._ZZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEEE5Asimg.5", i32 0, i32 15, i32 %32 + %add9.i = add nuw nsw i32 %0, 15 + %div.i = sdiv i32 %add9.i, 16 + br label %._crit_edge1286.i + +._crit_edge.i.loopexit: ; preds = %130 + br label %._crit_edge.i + +._crit_edge.i: ; preds = %._crit_edge.i.loopexit, %16 + %CValue1.0.lcssa.i = phi float [ 0.000000e+00, %16 ], [ %add827.i, %._crit_edge.i.loopexit ] + %CValue.0.lcssa.i = phi float [ 0.000000e+00, %16 ], [ %add475.i, %._crit_edge.i.loopexit ] + %cmp829.i = icmp slt i32 %add.i, %1 + %cmp830.i = icmp slt i32 %add8.i, %5 + %or.cond34 = and i1 %cmp829.i, %cmp830.i + br i1 %or.cond34, label %195, label %"_ZZ26cgemm_NoTransAB_loopunrollRN2hc16accelerator_viewEPNS_12short_vector7float_2ElS4_lS4_liiiiiiS3_S3_ENK3$_0clERNS_11tiled_indexILi2EEE.exit" + +._crit_edge1286.i: ; preds = %130, %.lr.ph.i + %k.01284.i = phi i32 [ 0, %.lr.ph.i ], [ %inc.i, %130 ] + %CValue.01283.i = phi float [ 0.000000e+00, %.lr.ph.i ], [ %add475.i, %130 ] + %CValue1.01282.i = phi float [ 0.000000e+00, %.lr.ph.i ], [ %add827.i, %130 ] + %mul10.i = shl nsw i32 %k.01284.i, 4 + %add13.i = add nsw i32 %mul10.i, %32 + %cmp14.i = icmp slt i32 %add13.i, %0 + %or.cond = and i1 %cmp15.i, %cmp14.i + br i1 %or.cond, label %113, label %120 + +;