Index: flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
===================================================================
--- flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -31,11 +31,14 @@
 #include "flang/Optimizer/Support/FIRContext.h"
 #include "flang/Optimizer/Transforms/Passes.h"
 #include "mlir/IR/Matchers.h"
+#include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "flang-simplify-intrinsics"
 
@@ -159,8 +162,13 @@
 /// with signature provided by \p funcOp. The caller is responsible
 /// for saving/restoring the original insertion point of \p builder.
 /// \p funcOp is expected to be empty on entry to this function.
+/// \p arg1ElementTy and \p arg2ElementTy specify elements types
+/// of the underlying array objects - they are used to generate proper
+/// element accesses.
 static void genFortranADotBody(fir::FirOpBuilder &builder,
-                               mlir::func::FuncOp &funcOp) {
+                               mlir::func::FuncOp &funcOp,
+                               mlir::Type arg1ElementTy,
+                               mlir::Type arg2ElementTy) {
   // function FortranADotProduct<T>_simplified(arr1, arr2)
   //   T, dimension(:) :: arr1, arr2
   //   T product = 0
@@ -171,14 +179,15 @@
   //   FortranADotProduct<T>_simplified = product
   // end function FortranADotProduct<T>_simplified
   auto loc = mlir::UnknownLoc::get(builder.getContext());
-  mlir::Type elementType = funcOp.getResultTypes()[0];
+  mlir::Type resultElementType = funcOp.getResultTypes()[0];
   builder.setInsertionPointToEnd(funcOp.addEntryBlock());
 
   mlir::IndexType idxTy = builder.getIndexType();
 
-  mlir::Value zero = elementType.isa<mlir::FloatType>()
-                         ? builder.createRealConstant(loc, elementType, 0.0)
-                         : builder.createIntegerConstant(loc, elementType, 0);
+  mlir::Value zero =
+      resultElementType.isa<mlir::FloatType>()
+          ? builder.createRealConstant(loc, resultElementType, 0.0)
+          : builder.createIntegerConstant(loc, resultElementType, 0);
 
   mlir::Block::BlockArgListType args = funcOp.front().getArguments();
   mlir::Value arg1 = args[0];
@@ -187,10 +196,12 @@
   mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);
 
   fir::SequenceType::Shape flatShape = {fir::SequenceType::getUnknownExtent()};
-  mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType);
-  mlir::Type boxArrTy = fir::BoxType::get(arrTy);
-  mlir::Value array1 = builder.create<fir::ConvertOp>(loc, boxArrTy, arg1);
-  mlir::Value array2 = builder.create<fir::ConvertOp>(loc, boxArrTy, arg2);
+  mlir::Type arrTy1 = fir::SequenceType::get(flatShape, arg1ElementTy);
+  mlir::Type boxArrTy1 = fir::BoxType::get(arrTy1);
+  mlir::Value array1 = builder.create<fir::ConvertOp>(loc, boxArrTy1, arg1);
+  mlir::Type arrTy2 = fir::SequenceType::get(flatShape, arg2ElementTy);
+  mlir::Type boxArrTy2 = fir::BoxType::get(arrTy2);
+  mlir::Value array2 = builder.create<fir::ConvertOp>(loc, boxArrTy2, arg2);
   // This version takes the loop trip count from the first argument.
   // If the first argument's box has unknown (at compilation time)
   // extent, then it may be better to take the extent from the second
@@ -216,19 +227,25 @@
   mlir::OpBuilder::InsertPoint loopEndPt = builder.saveInsertionPoint();
   builder.setInsertionPointToStart(loop.getBody());
 
-  mlir::Type eleRefTy = builder.getRefType(elementType);
+  mlir::Type eleRef1Ty = builder.getRefType(arg1ElementTy);
   mlir::Value index = loop.getInductionVar();
   mlir::Value addr1 =
-      builder.create<fir::CoordinateOp>(loc, eleRefTy, array1, index);
+      builder.create<fir::CoordinateOp>(loc, eleRef1Ty, array1, index);
   mlir::Value elem1 = builder.create<fir::LoadOp>(loc, addr1);
+  // Convert to the result type.
+  elem1 = builder.create<fir::ConvertOp>(loc, resultElementType, elem1);
+
+  mlir::Type eleRef2Ty = builder.getRefType(arg2ElementTy);
   mlir::Value addr2 =
-      builder.create<fir::CoordinateOp>(loc, eleRefTy, array2, index);
+      builder.create<fir::CoordinateOp>(loc, eleRef2Ty, array2, index);
   mlir::Value elem2 = builder.create<fir::LoadOp>(loc, addr2);
+  // Convert to the result type.
+  elem2 = builder.create<fir::ConvertOp>(loc, resultElementType, elem2);
 
-  if (elementType.isa<mlir::FloatType>())
+  if (resultElementType.isa<mlir::FloatType>())
     sumVal = builder.create<mlir::arith::AddFOp>(
         loc, builder.create<mlir::arith::MulFOp>(loc, elem1, elem2), sumVal);
-  else if (elementType.isa<mlir::IntegerType>())
+  else if (resultElementType.isa<mlir::IntegerType>())
     sumVal = builder.create<mlir::arith::AddIOp>(
         loc, builder.create<mlir::arith::MulIOp>(loc, elem1, elem2), sumVal);
   else
@@ -317,6 +334,29 @@
   return 0;
 }
 
+/// Given the call operation's box argument \p val, discover
+/// the element type of the underlying array object.
+/// \returns the element type or llvm::None if the type cannot
+/// be reliably found.
+/// We expect that the argument is a result of fir.convert
+/// with the destination type of !fir.box<none>.
+static llvm::Optional<mlir::Type> getArgElementType(mlir::Value val) {
+  mlir::Operation *defOp;
+  do {
+    defOp = val.getDefiningOp();
+    // Analyze only sequences of convert operations.
+    if (!mlir::isa<fir::ConvertOp>(defOp))
+      return llvm::None;
+    val = defOp->getOperand(0);
+    // The convert operation is expected to convert from one
+    // box type to another box type.
+    auto boxType = val.getType().cast<fir::BoxType>();
+    auto elementType = fir::unwrapSeqOrBoxedSeqType(boxType);
+    if (!elementType.isa<mlir::NoneType>())
+      return elementType;
+  } while (true);
+}
+
 void SimplifyIntrinsicsPass::runOnOperation() {
   LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
   mlir::ModuleOp module = getOperation();
@@ -380,11 +420,42 @@
           if (!type.isa<mlir::FloatType>() && !type.isa<mlir::IntegerType>())
             return;
 
+          // Try to find the element types of the boxed arguments.
+          auto arg1Type = getArgElementType(v1);
+          auto arg2Type = getArgElementType(v2);
+
+          if (!arg1Type || !arg2Type)
+            return;
+
+          // Support only floating point and integer arguments
+          // now (e.g. logical is skipped here).
+          if (!arg1Type->isa<mlir::FloatType>() &&
+              !arg1Type->isa<mlir::IntegerType>())
+            return;
+          if (!arg2Type->isa<mlir::FloatType>() &&
+              !arg2Type->isa<mlir::IntegerType>())
+            return;
+
           auto typeGenerator = [&type](fir::FirOpBuilder &builder) {
             return genFortranADotType(builder, type);
           };
+          auto bodyGenerator = [&arg1Type,
+                                &arg2Type](fir::FirOpBuilder &builder,
+                                           mlir::func::FuncOp &funcOp) {
+            genFortranADotBody(builder, funcOp, *arg1Type, *arg2Type);
+          };
+
+          // Suffix the function name with the element types
+          // of the arguments.
+          std::string typedFuncName(funcName);
+          llvm::raw_string_ostream nameOS(typedFuncName);
+          nameOS << "_";
+          arg1Type->print(nameOS);
+          nameOS << "_";
+          arg2Type->print(nameOS);
+
           mlir::func::FuncOp newFunc = getOrCreateFunction(
-              builder, funcName, typeGenerator, genFortranADotBody);
+              builder, typedFuncName, typeGenerator, bodyGenerator);
           auto newCall = builder.create<fir::CallOp>(loc, newFunc,
                                                      mlir::ValueRange{v1, v2});
           call->replaceAllUsesWith(newCall.getResults());
Index: flang/test/Transforms/simplifyintrinsics.fir
===================================================================
--- flang/test/Transforms/simplifyintrinsics.fir
+++ flang/test/Transforms/simplifyintrinsics.fir
@@ -344,15 +344,15 @@
 // CHECK:           %[[RESLOC:.*]] = fir.alloca f32 {bindc_name = "dot", uniq_name = "_QFdotEdot"}
 // CHECK:           %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
 // CHECK:           %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
-// CHECK:           %[[RES:.*]] = fir.call @_FortranADotProductReal4_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box<none>, !fir.box<none>) -> f32
+// CHECK:           %[[RES:.*]] = fir.call @_FortranADotProductReal4_f32_f32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box<none>, !fir.box<none>) -> f32
 // CHECK:           fir.store %[[RES]] to %[[RESLOC]] : !fir.ref<f32>
 // CHECK:           %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref<f32>
 // CHECK:           return %[[RET]] : f32
 // CHECK:         }
 
-// CHECK-LABEL:   func.func private @_FortranADotProductReal4_simplified(
-// CHECK-SAME:                                                           %[[A:.*]]: !fir.box<none>,
-// CHECK-SAME:                                                           %[[B:.*]]: !fir.box<none>) -> f32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK-LABEL:   func.func private @_FortranADotProductReal4_f32_f32_simplified(
+// CHECK-SAME:      %[[A:.*]]: !fir.box<none>,
+// CHECK-SAME:      %[[B:.*]]: !fir.box<none>) -> f32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
 // CHECK:           %[[FZERO:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[IZERO:.*]] = arith.constant 0 : index
 // CHECK:           %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box<none>) -> !fir.box<!fir.array<?xf32>>
@@ -363,9 +363,11 @@
 // CHECK:           %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[FZERO]]) -> (f32) {
 // CHECK:             %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
 // CHECK:             %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref<f32>
+// CHECK:             %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (f32) -> f32
 // CHECK:             %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
 // CHECK:             %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref<f32>
-// CHECK:             %[[MUL:.*]] = arith.mulf %[[AVAL]], %[[BVAL]] : f32
+// CHECK:             %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (f32) -> f32
+// CHECK:             %[[MUL:.*]] = arith.mulf %[[AVALCAST]], %[[BVALCAST]] : f32
 // CHECK:             %[[NEWSUM:.*]] = arith.addf %[[MUL]], %[[SUM]] : f32
 // CHECK:             fir.result %[[NEWSUM]] : f32
 // CHECK:           }
@@ -479,15 +481,15 @@
 // CHECK:           %[[RESLOC:.*]] = fir.alloca i32 {bindc_name = "dot", uniq_name = "_QFdotEdot"}
 // CHECK:           %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
 // CHECK:           %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
-// CHECK:           %[[RES:.*]] = fir.call @_FortranADotProductInteger4_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box<none>, !fir.box<none>) -> i32
+// CHECK:           %[[RES:.*]] = fir.call @_FortranADotProductInteger4_i32_i32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box<none>, !fir.box<none>) -> i32
 // CHECK:           fir.store %[[RES]] to %[[RESLOC]] : !fir.ref<i32>
 // CHECK:           %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref<i32>
 // CHECK:           return %[[RET]] : i32
 // CHECK:         }
 
-// CHECK-LABEL:   func.func private @_FortranADotProductInteger4_simplified(
-// CHECK-SAME:                                                           %[[A:.*]]: !fir.box<none>,
-// CHECK-SAME:                                                           %[[B:.*]]: !fir.box<none>) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK-LABEL:   func.func private @_FortranADotProductInteger4_i32_i32_simplified(
+// CHECK-SAME:      %[[A:.*]]: !fir.box<none>,
+// CHECK-SAME:      %[[B:.*]]: !fir.box<none>) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
 // CHECK:           %[[I32ZERO:.*]] = arith.constant 0 : i32
 // CHECK:           %[[IZERO:.*]] = arith.constant 0 : index
 // CHECK:           %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box<none>) -> !fir.box<!fir.array<?xi32>>
@@ -498,9 +500,11 @@
 // CHECK:           %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[I32ZERO]]) -> (i32) {
 // CHECK:             %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
 // CHECK:             %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref<i32>
+// CHECK:             %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (i32) -> i32
 // CHECK:             %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
 // CHECK:             %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref<i32>
-// CHECK:             %[[MUL:.*]] = arith.muli %[[AVAL]], %[[BVAL]] : i32
+// CHECK:             %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (i32) -> i32
+// CHECK:             %[[MUL:.*]] = arith.muli %[[AVALCAST]], %[[BVALCAST]] : i32
 // CHECK:             %[[NEWSUM:.*]] = arith.addi %[[MUL]], %[[SUM]] : i32
 // CHECK:             fir.result %[[NEWSUM]] : i32
 // CHECK:           }
@@ -587,3 +591,63 @@
 // CHECK-SAME:                      %[[A:.*]]: !fir.box<!fir.array<?xi64>> {fir.bindc_name = "a"},
 // CHECK-SAME:                      %[[B:.*]]: !fir.box<!fir.array<?xi64>> {fir.bindc_name = "b"}) -> i64 {
 // CHECK-NOT: call{{.*}}_FortranADotProductInteger8(
+
+// -----
+
+// Test mixed types, e.g. when _FortranADotProductReal8 is called
+// with <?xf64> and <?xf32> arguments. The loaded elements must be converted
+// to the result type REAL(8) before the computations.
+
+func.func @dot_f64_f32(%arg0: !fir.box<!fir.array<?xf64>> {fir.bindc_name = "a"}, %arg1: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "b"}) -> f64 {
+  %0 = fir.alloca f64 {bindc_name = "dot", uniq_name = "_QFdotEdot"}
+  %1 = fir.address_of(@_QQcl.2E2F646F742E66393000) : !fir.ref<!fir.char<1,10>>
+  %c3_i32 = arith.constant 3 : i32
+  %2 = fir.convert %arg0 : (!fir.box<!fir.array<?xf64>>) -> !fir.box<none>
+  %3 = fir.convert %arg1 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
+  %4 = fir.convert %1 : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<i8>
+  %5 = fir.call @_FortranADotProductReal8(%2, %3, %4, %c3_i32) : (!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f64
+  fir.store %5 to %0 : !fir.ref<f64>
+  %6 = fir.load %0 : !fir.ref<f64>
+  return %6 : f64
+}
+func.func private @_FortranADotProductReal4(!fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> f32 attributes {fir.runtime}
+fir.global linkonce @_QQcl.2E2F646F742E66393000 constant : !fir.char<1,10> {
+  %0 = fir.string_lit "./dot.f90\00"(10) : !fir.char<1,10>
+  fir.has_value %0 : !fir.char<1,10>
+}
+
+// CHECK-LABEL:   func.func @dot_f64_f32(
+// CHECK-SAME:      %[[A:.*]]: !fir.box<!fir.array<?xf64>> {fir.bindc_name = "a"},
+// CHECK-SAME:      %[[B:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "b"}) -> f64 {
+// CHECK:           %[[RESLOC:.*]] = fir.alloca f64 {bindc_name = "dot", uniq_name = "_QFdotEdot"}
+// CHECK:           %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box<!fir.array<?xf64>>) -> !fir.box<none>
+// CHECK:           %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
+// CHECK:           %[[RES:.*]] = fir.call @_FortranADotProductReal8_f64_f32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box<none>, !fir.box<none>) -> f64
+// CHECK:           fir.store %[[RES]] to %[[RESLOC]] : !fir.ref<f64>
+// CHECK:           %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref<f64>
+// CHECK:           return %[[RET]] : f64
+// CHECK:         }
+
+// CHECK-LABEL:   func.func private @_FortranADotProductReal8_f64_f32_simplified(
+// CHECK-SAME:      %[[A:.*]]: !fir.box<none>,
+// CHECK-SAME:      %[[B:.*]]: !fir.box<none>) -> f64 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK:           %[[FZERO:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           %[[IZERO:.*]] = arith.constant 0 : index
+// CHECK:           %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box<none>) -> !fir.box<!fir.array<?xf64>>
+// CHECK:           %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box<none>) -> !fir.box<!fir.array<?xf32>>
+// CHECK:           %[[DIMS:.*]]:3 = fir.box_dims %[[ACAST]], %[[IZERO]] : (!fir.box<!fir.array<?xf64>>, index) -> (index, index, index)
+// CHECK:           %[[IONE:.*]] = arith.constant 1 : index
+// CHECK:           %[[LEN:.*]] = arith.subi %[[DIMS]]#1, %[[IONE]] : index
+// CHECK:           %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[FZERO]]) -> (f64) {
+// CHECK:             %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+// CHECK:             %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref<f64>
+// CHECK:             %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (f64) -> f64
+// CHECK:             %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:             %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref<f32>
+// CHECK:             %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (f32) -> f64
+// CHECK:             %[[MUL:.*]] = arith.mulf %[[AVALCAST]], %[[BVALCAST]] : f64
+// CHECK:             %[[NEWSUM:.*]] = arith.addf %[[MUL]], %[[SUM]] : f64
+// CHECK:             fir.result %[[NEWSUM]] : f64
+// CHECK:           }
+// CHECK:           return %[[RES]] : f64
+// CHECK:         }