From 31b53c08f86b56b4762ab16a118db9d61ed488df Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Mon, 29 Jun 2026 19:40:13 -0500
Subject: [PATCH 1/3] Speed up in-place vector-to-C-contiguous-matrix broadcast
 on CPU

In-place binary elementwise ops broadcasting a vector against a
C-contiguous matrix (m += row, m += col[:, None]) fell through to the
general strided kernel on CPU, although a vectorized row-broadcast
kernel already exists and is used by the out-of-place path.

- Add the missing C-contiguous row-broadcast dispatch branch to
  py_binary_inplace_ufunc (reuses the existing
  BinaryInplaceRowMatrixBroadcastingFunctor); the in-place template
  previously only had the F-style {1,0} branch while the out-of-place
  path already handled the {0,1} C-contiguous case.
- Add BinaryInplaceColMatrixBroadcastingFunctor for the column case
  (mat[gid] += vec[gid / n1]) and wire it for add via a defaulted extra
  template parameter, keeping all other in-place ufuncs unchanged.

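Both paths are guarded by exact simplified-stride checks; the row path
additionally requires both operands to pass the alignment check. Any
other case, including type pairs whose dispatch entry is nullptr (e.g.
complex types in the column factory), falls back to the strided kernel.
Results are bitwise-identical to those of the strided kernel.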

Add TestAdd::test_inplace_row_broadcast and
TestAdd::test_inplace_column_broadcast covering several shapes (incl.
row lengths not a multiple of the sub-group size) across dtypes.
---
 CHANGELOG.md                                  |  1 +
 .../kernels/elementwise_functions/add.hpp     | 53 ++++++++++++++
 .../elementwise_functions/common_inplace.hpp  | 72 +++++++++++++++++++
 .../source/elementwise_functions/add.cpp      | 18 ++++-
 .../elementwise_functions.hpp                 | 62 +++++++++++++++-
 dpnp/tests/test_binary_ufuncs.py              | 38 ++++++++++
 6 files changed, 241 insertions(+), 3 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27c8329caa19..0a7aa076c3fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ This release is compatible with NumPy 2.5.
 
 ### Changed
 
+* Improved performance of in-place binary elementwise operations broadcasting a vector against a C-contiguous matrix (e.g. `m += row`, `m += col[:, None]`) on CPU by dispatching to vectorized broadcast kernels instead of the general strided kernel [#XXXX](https://github.com/IntelPython/dpnp/pull/XXXX)
 * Changed `dpnp.meshgrid` and `dpnp.tensor.meshgrid` to return a tuple instead of a list, aligning with NumPy 2.5+ behavior and 2025.12 version of the Python array API standard [#2854](https://github.com/IntelPython/dpnp/pull/2854)
 * Updated `searchsorted` implementations to align with the 2025.12 array API spec [#2902](https://github.com/IntelPython/dpnp/pull/2902)
 * Updated tests to align with NumPy 2.4.5 compatibility [#2920](https://github.com/IntelPython/dpnp/pull/2920)
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index a4d5b4252e99..0aa81dc67ed6 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -676,4 +676,57 @@ struct AddInplaceRowMatrixBroadcastFactory
     }
 };
 
+// In-place column-broadcast wiring for add.
+template <typename argT, typename resT>
+class add_inplace_col_matrix_broadcast_krn;
+
+template <typename argT, typename resT>
+using AddInplaceColMatrixBroadcastingFunctor =
+    elementwise_common::BinaryInplaceColMatrixBroadcastingFunctor<
+        argT,
+        resT,
+        AddInplaceFunctor<argT, resT>>;
+
+template <typename argT, typename resT>
+sycl::event add_inplace_col_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p,
+    ssize_t vec_offset,
+    char *mat_p,
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_col_matrix_broadcast_impl<
+        argT, resT, AddInplaceColMatrixBroadcastingFunctor,
+        add_inplace_col_matrix_broadcast_krn>(exec_q, host_tasks, n0, n1,
+                                              vec_p, vec_offset, mat_p,
+                                              mat_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct AddInplaceColMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (dpnp::tensor::type_utils::is_complex<T1>::value ||
+                          dpnp::tensor::type_utils::is_complex<T2>::value) {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn = add_inplace_col_matrix_broadcast_impl<T1, T2>;
+                return fn;
+            }
+        }
+    }
+};
+
 } // namespace dpnp::tensor::kernels::add
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index 9384ec603754..a87d0e0ae86e 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -471,4 +471,76 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
     return comp_ev;
 }
 
+// In-place column-broadcast: C-contiguous matrix += column vector.
+// Scalar kernel: one work-item per element, mat[gid] += vec[gid / n1] (n1 = row
+// length so gid/n1 = row index). No sub-group collectives -> correct for any n1.
+template <typename argT, typename resT, typename BinaryOperatorT>
+struct BinaryInplaceColMatrixBroadcastingFunctor
+{
+private:
+    const argT *vec; // (n0,) contiguous column vector, one scalar per row
+    resT *mat;       // (n0, n1) C-contiguous matrix
+    std::size_t n1;  // contiguous row length
+
+public:
+    BinaryInplaceColMatrixBroadcastingFunctor(const argT *col_tp,
+                                              resT *mat_tp,
+                                              std::size_t n_elems_in_row)
+        : vec(col_tp), mat(mat_tp), n1(n_elems_in_row)
+    {
+    }
+
+    void operator()(sycl::id<1> wid) const
+    {
+        BinaryOperatorT op{};
+        const std::size_t gid = wid.get(0);
+        op(mat[gid], vec[gid / n1]);
+    }
+};
+
+typedef sycl::event (*binary_inplace_col_matrix_broadcast_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::vector<sycl::event> &,
+    std::size_t,
+    std::size_t,
+    const char *,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename argT,
+          typename resT,
+          template <typename T1, typename T3>
+          class BinaryInplaceColMatrixBroadcastFunctorT,
+          template <typename T1, typename T3> class kernel_name>
+sycl::event binary_inplace_col_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p,
+    ssize_t vec_offset,
+    char *mat_p,
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    const argT *vec = reinterpret_cast<const argT *>(vec_p) + vec_offset;
+    resT *mat = reinterpret_cast<resT *>(mat_p) + mat_offset;
+    (void)host_tasks; // no padding/temporary needed for column broadcast
+
+    const std::size_t n_elems = n0 * n1;
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using Impl = BinaryInplaceColMatrixBroadcastFunctorT<argT, resT>;
+
+        cgh.parallel_for<class kernel_name<argT, resT>>(
+            sycl::range<1>(n_elems), Impl(vec, mat, n1));
+    });
+
+    return comp_ev;
+}
+
 } // namespace dpnp::tensor::kernels::elementwise_common
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
index 7ba2fd8a37f3..99e7bd6a92bc 100644
--- a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
@@ -65,6 +65,7 @@ using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
 
 using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
 using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_col_matrix_broadcast_impl_fn_ptr_t;
 using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
 
 // B01: ===== ADD (x1, x2)
@@ -98,6 +99,8 @@ static binary_inplace_strided_impl_fn_ptr_t
     add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
 static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
     add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_col_matrix_broadcast_impl_fn_ptr_t
+    add_inplace_col_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
 
 void populate_add_dispatch_tables(void)
 {
@@ -165,6 +168,14 @@ void populate_add_dispatch_tables(void)
         dtb8;
     dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table);
 
+    // function pointers for the in-place c-contig matrix += column
+    // broadcast operation
+    using fn_ns::AddInplaceColMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_col_matrix_broadcast_impl_fn_ptr_t,
+                         AddInplaceColMatrixBroadcastFactory, num_types>
+        dtb10;
+    dtb10.populate_dispatch_table(add_inplace_col_matrix_dispatch_table);
+
     // which types are supported by the in-place kernels
     using fn_ns::AddInplaceTypeMapFactory;
     DispatchTableBuilder<int, AddInplaceTypeMapFactory, num_types> dtb9;
@@ -216,6 +227,7 @@ void init_add(py::module_ m)
         using impl::add_inplace_contig_dispatch_table;
         using impl::add_inplace_output_id_table;
         using impl::add_inplace_row_matrix_dispatch_table;
+        using impl::add_inplace_col_matrix_dispatch_table;
         using impl::add_inplace_strided_dispatch_table;
 
         auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
@@ -232,7 +244,11 @@ void init_add(py::module_ m)
                 // function pointers to handle inplace operation on
                 // c-contig matrix with c-contig row with broadcasting
                 // (may be nullptr)
-                add_inplace_row_matrix_dispatch_table);
+                add_inplace_row_matrix_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig column with broadcasting
+                // (may be nullptr)
+                add_inplace_col_matrix_dispatch_table);
         };
         m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
               py::arg("rhs"), py::arg("sycl_queue"),
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index a3924c8d28a6..9263d0213891 100644
--- a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -607,7 +607,10 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype,
 template <typename output_typesT,
           typename contig_dispatchT,
           typename strided_dispatchT,
-          typename contig_row_matrix_dispatchT>
+          typename contig_row_matrix_dispatchT,
+          // Optional table for the C-contiguous matrix += column broadcast
+          // case; defaulted so existing callers stay source-compatible.
+          typename contig_col_matrix_dispatchT = std::nullptr_t>
 std::pair<sycl::event, sycl::event>
     py_binary_inplace_ufunc(const dpnp::tensor::usm_ndarray &lhs,
                             const dpnp::tensor::usm_ndarray &rhs,
@@ -618,7 +621,10 @@ std::pair<sycl::event, sycl::event>
                             const contig_dispatchT &contig_dispatch_table,
                             const strided_dispatchT &strided_dispatch_table,
                             const contig_row_matrix_dispatchT
-                                &contig_row_matrix_broadcast_dispatch_table)
+                                &contig_row_matrix_broadcast_dispatch_table,
+                            const contig_col_matrix_dispatchT
+                                &contig_col_matrix_broadcast_dispatch_table =
+                                    nullptr)
 {
     dpnp::tensor::validation::CheckWritable::throw_if_not_writable(lhs);
 
@@ -745,9 +751,61 @@ std::pair<sycl::event, sycl::event>
             }
         }
         if (nd == 2) {
+            static constexpr auto zero_one_strides =
+                std::initializer_list<py::ssize_t>{0, 1};
             static constexpr auto one_zero_strides =
                 std::initializer_list<py::ssize_t>{1, 0};
             static constexpr py::ssize_t one{1};
+            // C-contiguous matrix (lhs) and a row (rhs): D(N0,N1) += row(N1,)
+            // lhs strides {N1,1} = {shape[1],1}, rhs (row broadcast) strides {0,1}
+            if (isEqual(simplified_rhs_strides, zero_one_strides) &&
+                isEqual(simplified_lhs_strides, {simplified_shape[1], one})) {
+                auto row_matrix_broadcast_fn =
+                    contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
+                                                              [lhs_typeid];
+                if (row_matrix_broadcast_fn != nullptr) {
+                    int rhs_itemsize = rhs.get_elemsize();
+                    int lhs_itemsize = lhs.get_elemsize();
+                    if (is_aligned<required_alignment>(
+                            rhs_data + rhs_offset * rhs_itemsize) &&
+                        is_aligned<required_alignment>(
+                            lhs_data + lhs_offset * lhs_itemsize)) {
+                        std::size_t n0 = simplified_shape[0];
+                        std::size_t n1 = simplified_shape[1];
+                        sycl::event comp_ev = row_matrix_broadcast_fn(
+                            exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
+                            lhs_data, lhs_offset, depends);
+
+                        return std::make_pair(
+                            dpnp::utils::keep_args_alive(
+                                exec_q, {lhs, rhs}, host_tasks),
+                            comp_ev);
+                    }
+                }
+            }
+            // C-contiguous matrix (lhs) and a column (rhs): D(N0,N1) += col(N0,1)
+            // rhs(col broadcast) strides {1,0}; lhs(C-contig) {shape[1],1}
+            if constexpr (!std::is_same_v<contig_col_matrix_dispatchT,
+                                          std::nullptr_t>) {
+                if (isEqual(simplified_rhs_strides, one_zero_strides) &&
+                    isEqual(simplified_lhs_strides,
+                            {simplified_shape[1], one})) {
+                    auto col_matrix_broadcast_fn =
+                        contig_col_matrix_broadcast_dispatch_table[rhs_typeid]
+                                                                  [lhs_typeid];
+                    if (col_matrix_broadcast_fn != nullptr) {
+                        std::size_t n0 = simplified_shape[0];
+                        std::size_t n1 = simplified_shape[1];
+                        sycl::event comp_ev = col_matrix_broadcast_fn(
+                            exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
+                            lhs_data, lhs_offset, depends);
+                        return std::make_pair(
+                            dpnp::utils::keep_args_alive(
+                                exec_q, {lhs, rhs}, host_tasks),
+                            comp_ev);
+                    }
+                }
+            }
             // special case of C-contiguous matrix and a row
             if (isEqual(simplified_rhs_strides, one_zero_strides) &&
                 isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) {
diff --git a/dpnp/tests/test_binary_ufuncs.py b/dpnp/tests/test_binary_ufuncs.py
index 2e97047951a8..2b431f15fe19 100644
--- a/dpnp/tests/test_binary_ufuncs.py
+++ b/dpnp/tests/test_binary_ufuncs.py
@@ -121,6 +121,44 @@ def test_inplace_dtype_explicit(self, dtype1, dtype2):
             assert_raises(TypeError, numpy.add, a, b, out=a)
             assert_raises(ValueError, dpnp.add, ia, ib, out=ia)
 
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    @pytest.mark.parametrize(
+        "shape", [(4, 4), (3, 5), (100, 37), (513, 256)]
+    )
+    def test_inplace_row_broadcast(self, shape, dtype):
+        # C-contiguous matrix += row vector, in place (broadcast over rows)
+        n0, n1 = shape
+        a = get_abs_array(
+            numpy.arange(n0 * n1).reshape(shape) % 7 + 1, dtype=dtype
+        )
+        row = get_abs_array(numpy.arange(n1) % 3 + 1, dtype=dtype)
+        ia, irow = dpnp.array(a), dpnp.array(row)
+
+        a += row
+        ia += irow
+        assert_dtype_allclose(ia, a)
+
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    @pytest.mark.parametrize(
+        "shape", [(4, 4), (3, 5), (100, 37), (513, 256)]
+    )
+    def test_inplace_column_broadcast(self, shape, dtype):
+        # C-contiguous matrix += column vector, in place (broadcast over columns)
+        n0, n1 = shape
+        a = get_abs_array(
+            numpy.arange(n0 * n1).reshape(shape) % 7 + 1, dtype=dtype
+        )
+        col = get_abs_array((numpy.arange(n0) % 3 + 1).reshape(n0, 1), dtype=dtype)
+        ia, icol = dpnp.array(a), dpnp.array(col)
+
+        a += col
+        ia += icol
+        assert_dtype_allclose(ia, a)
+
     @pytest.mark.parametrize("shape", [(0,), (15,), (2, 2)])
     def test_invalid_shape(self, shape):
         a, b = dpnp.arange(10), dpnp.arange(10)

From 43beb19ca975c0e66f653089a930a1dba5d582ea Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Mon, 29 Jun 2026 20:01:42 -0500
Subject: [PATCH 2/3] Update changelog with PR number #2981

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0a7aa076c3fc..cdab0a6cb04d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,7 @@ This release is compatible with NumPy 2.5.
 
 ### Changed
 
-* Improved performance of in-place binary elementwise operations broadcasting a vector against a C-contiguous matrix (e.g. `m += row`, `m += col[:, None]`) on CPU by dispatching to vectorized broadcast kernels instead of the general strided kernel [#XXXX](https://github.com/IntelPython/dpnp/pull/XXXX)
+* Improved performance of in-place binary elementwise operations broadcasting a vector against a C-contiguous matrix (e.g. `m += row`, `m += col[:, None]`) on CPU by dispatching to dedicated broadcast kernels instead of the general strided kernel [#2981](https://github.com/IntelPython/dpnp/pull/2981)
 * Changed `dpnp.meshgrid` and `dpnp.tensor.meshgrid` to return a tuple instead of a list, aligning with NumPy 2.5+ behavior and 2025.12 version of the Python array API standard [#2854](https://github.com/IntelPython/dpnp/pull/2854)
 * Updated `searchsorted` implementations to align with the 2025.12 array API spec [#2902](https://github.com/IntelPython/dpnp/pull/2902)
 * Updated tests to align with NumPy 2.4.5 compatibility [#2920](https://github.com/IntelPython/dpnp/pull/2920)

From 5e1ea14fd8fca4509876fdf69984303515818622 Mon Sep 17 00:00:00 2001
From: vchamarthi <venkata.sai.gireesh.chamarthi@intel.com>
Date: Mon, 29 Jun 2026 20:11:13 -0500
Subject: [PATCH 3/3] Apply pre-commit formatting (black, clang-format)

---
 .../kernels/elementwise_functions/add.hpp     |  6 +--
 .../elementwise_functions/common_inplace.hpp  | 11 ++---
 .../source/elementwise_functions/add.cpp      |  4 +-
 .../elementwise_functions.hpp                 | 43 ++++++++++---------
 dpnp/tests/test_binary_ufuncs.py              | 12 +++---
 5 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 0aa81dc67ed6..a5c20d5ce124 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -701,9 +701,9 @@ sycl::event add_inplace_col_matrix_broadcast_impl(
 {
     return elementwise_common::binary_inplace_col_matrix_broadcast_impl<
         argT, resT, AddInplaceColMatrixBroadcastingFunctor,
-        add_inplace_col_matrix_broadcast_krn>(exec_q, host_tasks, n0, n1,
-                                              vec_p, vec_offset, mat_p,
-                                              mat_offset, depends);
+        add_inplace_col_matrix_broadcast_krn>(exec_q, host_tasks, n0, n1, vec_p,
+                                              vec_offset, mat_p, mat_offset,
+                                              depends);
 }
 
 template <typename fnT, typename T1, typename T2>
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index a87d0e0ae86e..43cb2c3e0f0d 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -473,7 +473,8 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
 
 // In-place column-broadcast: C-contiguous matrix += column vector.
 // Scalar kernel: one work-item per element, mat[gid] += vec[gid / n1] (n1 = row
-// length so gid/n1 = row index). No sub-group collectives -> correct for any n1.
+// length so gid/n1 = row index). No sub-group collectives -> correct for any
+// n1.
 template <typename argT, typename resT, typename BinaryOperatorT>
 struct BinaryInplaceColMatrixBroadcastingFunctor
 {
@@ -511,8 +512,8 @@ typedef sycl::event (*binary_inplace_col_matrix_broadcast_impl_fn_ptr_t)(
 
 template <typename argT,
           typename resT,
-          template <typename T1, typename T3>
-          class BinaryInplaceColMatrixBroadcastFunctorT,
+          template <typename T1,
+                    typename T3> class BinaryInplaceColMatrixBroadcastFunctorT,
           template <typename T1, typename T3> class kernel_name>
 sycl::event binary_inplace_col_matrix_broadcast_impl(
     sycl::queue &exec_q,
@@ -536,8 +537,8 @@ sycl::event binary_inplace_col_matrix_broadcast_impl(
 
         using Impl = BinaryInplaceColMatrixBroadcastFunctorT<argT, resT>;
 
-        cgh.parallel_for<class kernel_name<argT, resT>>(
-            sycl::range<1>(n_elems), Impl(vec, mat, n1));
+        cgh.parallel_for<class kernel_name<argT, resT>>(sycl::range<1>(n_elems),
+                                                        Impl(vec, mat, n1));
     });
 
     return comp_ev;
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
index 99e7bd6a92bc..c3ba5c856c3e 100644
--- a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
@@ -63,9 +63,9 @@ using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
 using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
 using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
 
+using ew_cmn_ns::binary_inplace_col_matrix_broadcast_impl_fn_ptr_t;
 using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
 using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_col_matrix_broadcast_impl_fn_ptr_t;
 using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
 
 // B01: ===== ADD (x1, x2)
@@ -224,10 +224,10 @@ void init_add(py::module_ m)
               py::arg("depends") = py::list());
         m.def("_add_result_type", add_result_type_pyapi, "");
 
+        using impl::add_inplace_col_matrix_dispatch_table;
         using impl::add_inplace_contig_dispatch_table;
         using impl::add_inplace_output_id_table;
         using impl::add_inplace_row_matrix_dispatch_table;
-        using impl::add_inplace_col_matrix_dispatch_table;
         using impl::add_inplace_strided_dispatch_table;
 
         auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index 9263d0213891..f995499a2d91 100644
--- a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -611,20 +611,19 @@ template <typename output_typesT,
           // Optional table for the C-contiguous matrix += column broadcast
           // case; defaulted so existing callers stay source-compatible.
           typename contig_col_matrix_dispatchT = std::nullptr_t>
-std::pair<sycl::event, sycl::event>
-    py_binary_inplace_ufunc(const dpnp::tensor::usm_ndarray &lhs,
-                            const dpnp::tensor::usm_ndarray &rhs,
-                            sycl::queue &exec_q,
-                            const std::vector<sycl::event> depends,
-                            //
-                            const output_typesT &output_type_table,
-                            const contig_dispatchT &contig_dispatch_table,
-                            const strided_dispatchT &strided_dispatch_table,
-                            const contig_row_matrix_dispatchT
-                                &contig_row_matrix_broadcast_dispatch_table,
-                            const contig_col_matrix_dispatchT
-                                &contig_col_matrix_broadcast_dispatch_table =
-                                    nullptr)
+std::pair<sycl::event, sycl::event> py_binary_inplace_ufunc(
+    const dpnp::tensor::usm_ndarray &lhs,
+    const dpnp::tensor::usm_ndarray &rhs,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> depends,
+    //
+    const output_typesT &output_type_table,
+    const contig_dispatchT &contig_dispatch_table,
+    const strided_dispatchT &strided_dispatch_table,
+    const contig_row_matrix_dispatchT
+        &contig_row_matrix_broadcast_dispatch_table,
+    const contig_col_matrix_dispatchT
+        &contig_col_matrix_broadcast_dispatch_table = nullptr)
 {
     dpnp::tensor::validation::CheckWritable::throw_if_not_writable(lhs);
 
@@ -757,7 +756,8 @@ std::pair<sycl::event, sycl::event>
                 std::initializer_list<py::ssize_t>{1, 0};
             static constexpr py::ssize_t one{1};
             // C-contiguous matrix (lhs) and a row (rhs): D(N0,N1) += row(N1,)
-            // lhs strides {N1,1} = {shape[1],1}, rhs (row broadcast) strides {0,1}
+            // lhs strides {N1,1} = {shape[1],1}, rhs (row broadcast) strides
+            // {0,1}
             if (isEqual(simplified_rhs_strides, zero_one_strides) &&
                 isEqual(simplified_lhs_strides, {simplified_shape[1], one})) {
                 auto row_matrix_broadcast_fn =
@@ -777,14 +777,15 @@ std::pair<sycl::event, sycl::event>
                             lhs_data, lhs_offset, depends);
 
                         return std::make_pair(
-                            dpnp::utils::keep_args_alive(
-                                exec_q, {lhs, rhs}, host_tasks),
+                            dpnp::utils::keep_args_alive(exec_q, {lhs, rhs},
+                                                         host_tasks),
                             comp_ev);
                     }
                 }
             }
-            // C-contiguous matrix (lhs) and a column (rhs): D(N0,N1) += col(N0,1)
-            // rhs(col broadcast) strides {1,0}; lhs(C-contig) {shape[1],1}
+            // C-contiguous matrix (lhs) and a column (rhs):
+            // D(N0,N1) += col(N0,1); rhs (column broadcast) strides {1,0},
+            // lhs (C-contiguous) strides {shape[1],1}
             if constexpr (!std::is_same_v<contig_col_matrix_dispatchT,
                                           std::nullptr_t>) {
                 if (isEqual(simplified_rhs_strides, one_zero_strides) &&
@@ -800,8 +801,8 @@ std::pair<sycl::event, sycl::event>
                             exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
                             lhs_data, lhs_offset, depends);
                         return std::make_pair(
-                            dpnp::utils::keep_args_alive(
-                                exec_q, {lhs, rhs}, host_tasks),
+                            dpnp::utils::keep_args_alive(exec_q, {lhs, rhs},
+                                                         host_tasks),
                             comp_ev);
                     }
                 }
diff --git a/dpnp/tests/test_binary_ufuncs.py b/dpnp/tests/test_binary_ufuncs.py
index 2b431f15fe19..f71da9b12572 100644
--- a/dpnp/tests/test_binary_ufuncs.py
+++ b/dpnp/tests/test_binary_ufuncs.py
@@ -124,9 +124,7 @@ def test_inplace_dtype_explicit(self, dtype1, dtype2):
     @pytest.mark.parametrize(
         "dtype", get_all_dtypes(no_none=True, no_bool=True)
     )
-    @pytest.mark.parametrize(
-        "shape", [(4, 4), (3, 5), (100, 37), (513, 256)]
-    )
+    @pytest.mark.parametrize("shape", [(4, 4), (3, 5), (100, 37), (513, 256)])
     def test_inplace_row_broadcast(self, shape, dtype):
         # C-contiguous matrix += row vector, in place (broadcast over rows)
         n0, n1 = shape
@@ -143,16 +141,16 @@ def test_inplace_row_broadcast(self, shape, dtype):
     @pytest.mark.parametrize(
         "dtype", get_all_dtypes(no_none=True, no_bool=True)
     )
-    @pytest.mark.parametrize(
-        "shape", [(4, 4), (3, 5), (100, 37), (513, 256)]
-    )
+    @pytest.mark.parametrize("shape", [(4, 4), (3, 5), (100, 37), (513, 256)])
     def test_inplace_column_broadcast(self, shape, dtype):
         # C-contiguous matrix += column vector, in place (broadcast over columns)
         n0, n1 = shape
         a = get_abs_array(
             numpy.arange(n0 * n1).reshape(shape) % 7 + 1, dtype=dtype
         )
-        col = get_abs_array((numpy.arange(n0) % 3 + 1).reshape(n0, 1), dtype=dtype)
+        col = get_abs_array(
+            (numpy.arange(n0) % 3 + 1).reshape(n0, 1), dtype=dtype
+        )
         ia, icol = dpnp.array(a), dpnp.array(col)
 
         a += col