From 31b53c08f86b56b4762ab16a118db9d61ed488df Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Mon, 29 Jun 2026 19:40:13 -0500 Subject: [PATCH 1/3] Speed up in-place vector-to-C-contiguous-matrix broadcast on CPU In-place binary elementwise ops broadcasting a vector against a C-contiguous matrix (m += row, m += col[:, None]) fell through to the general strided kernel on CPU, although a vectorized row-broadcast kernel already exists and is used by the out-of-place path. - Add the missing C-contiguous row-broadcast dispatch branch to py_binary_inplace_ufunc (reuses the existing BinaryInplaceRowMatrixBroadcastingFunctor); the in-place template previously only had the F-style {1,0} branch while the out-of-place path already handled the {0,1} C-contiguous case. - Add BinaryInplaceColMatrixBroadcastingFunctor for the column case (mat[gid] += vec[gid / n1]) and wire it for add via a defaulted extra template parameter, keeping all other in-place ufuncs unchanged. Both paths are guarded by exact simplified-stride checks and fall back to the strided kernel otherwise. Results are bitwise-identical. Adds TestAdd::test_inplace_row_broadcast and TestAdd::test_inplace_column_broadcast covering several shapes (incl. row lengths not a multiple of the sub-group size) across dtypes. --- CHANGELOG.md | 1 + .../kernels/elementwise_functions/add.hpp | 53 ++++++++++++++ .../elementwise_functions/common_inplace.hpp | 72 +++++++++++++++++++ .../source/elementwise_functions/add.cpp | 18 ++++- .../elementwise_functions.hpp | 62 +++++++++++++++- dpnp/tests/test_binary_ufuncs.py | 38 ++++++++++ 6 files changed, 241 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27c8329caa19..0a7aa076c3fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ This release is compatible with NumPy 2.5. ### Changed +* Improved performance of in-place binary elementwise operations broadcasting a vector against a C-contiguous matrix (e.g. 
`m += row`, `m += col[:, None]`) on CPU by dispatching to vectorized broadcast kernels instead of the general strided kernel [#XXXX](https://github.com/IntelPython/dpnp/pull/XXXX) * Changed `dpnp.meshgrid` and `dpnp.tensor.meshgrid` to return a tuple instead of a list, aligning with NumPy 2.5+ behavior and 2025.12 version of the Python array API standard [#2854](https://github.com/IntelPython/dpnp/pull/2854) * Updated `searchsorted` implementations to align with the 2025.12 array API spec [#2902](https://github.com/IntelPython/dpnp/pull/2902) * Updated tests to align with NumPy 2.4.5 compatibility [#2920](https://github.com/IntelPython/dpnp/pull/2920) diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index a4d5b4252e99..0aa81dc67ed6 100644 --- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -676,4 +676,57 @@ struct AddInplaceRowMatrixBroadcastFactory } }; +// In-place column-broadcast wiring for add. 
+template +class add_inplace_col_matrix_broadcast_krn; + +template +using AddInplaceColMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceColMatrixBroadcastingFunctor< + argT, + resT, + AddInplaceFunctor>; + +template +sycl::event add_inplace_col_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, + ssize_t vec_offset, + char *mat_p, + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_col_matrix_broadcast_impl< + argT, resT, AddInplaceColMatrixBroadcastingFunctor, + add_inplace_col_matrix_broadcast_krn>(exec_q, host_tasks, n0, n1, + vec_p, vec_offset, mat_p, + mat_offset, depends); +} + +template +struct AddInplaceColMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!AddInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpnp::tensor::type_utils::is_complex::value || + dpnp::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_inplace_col_matrix_broadcast_impl; + return fn; + } + } + } +}; + } // namespace dpnp::tensor::kernels::add diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index 9384ec603754..a87d0e0ae86e 100644 --- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -471,4 +471,76 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( return comp_ev; } +// In-place column-broadcast: C-contiguous matrix += column vector. +// Scalar kernel: one work-item per element, mat[gid] += vec[gid / n1] (n1 = row +// length so gid/n1 = row index). No sub-group collectives -> correct for any n1. 
+template +struct BinaryInplaceColMatrixBroadcastingFunctor +{ +private: + const argT *vec; // (n0,) contiguous column vector, one scalar per row + resT *mat; // (n0, n1) C-contiguous matrix + std::size_t n1; // contiguous row length + +public: + BinaryInplaceColMatrixBroadcastingFunctor(const argT *col_tp, + resT *mat_tp, + std::size_t n_elems_in_row) + : vec(col_tp), mat(mat_tp), n1(n_elems_in_row) + { + } + + void operator()(sycl::id<1> wid) const + { + BinaryOperatorT op{}; + const std::size_t gid = wid.get(0); + op(mat[gid], vec[gid / n1]); + } +}; + +typedef sycl::event (*binary_inplace_col_matrix_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +template + class BinaryInplaceColMatrixBroadcastFunctorT, + template class kernel_name> +sycl::event binary_inplace_col_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, + ssize_t vec_offset, + char *mat_p, + ssize_t mat_offset, + const std::vector &depends = {}) +{ + const argT *vec = reinterpret_cast(vec_p) + vec_offset; + resT *mat = reinterpret_cast(mat_p) + mat_offset; + (void)host_tasks; // no padding/temporary needed for column broadcast + + const std::size_t n_elems = n0 * n1; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using Impl = BinaryInplaceColMatrixBroadcastFunctorT; + + cgh.parallel_for>( + sycl::range<1>(n_elems), Impl(vec, mat, n1)); + }); + + return comp_ev; +} + } // namespace dpnp::tensor::kernels::elementwise_common diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp index 7ba2fd8a37f3..99e7bd6a92bc 100644 --- a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp +++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp @@ -65,6 +65,7 @@ using 
ew_cmn_ns::binary_strided_impl_fn_ptr_t; using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_col_matrix_broadcast_impl_fn_ptr_t; using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; // B01: ===== ADD (x1, x2) @@ -98,6 +99,8 @@ static binary_inplace_strided_impl_fn_ptr_t add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_col_matrix_broadcast_impl_fn_ptr_t + add_inplace_col_matrix_dispatch_table[td_ns::num_types][td_ns::num_types]; void populate_add_dispatch_tables(void) { @@ -165,6 +168,14 @@ void populate_add_dispatch_tables(void) dtb8; dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table); + // function pointers for the in-place c-contig matrix += column + // broadcast operation + using fn_ns::AddInplaceColMatrixBroadcastFactory; + DispatchTableBuilder + dtb10; + dtb10.populate_dispatch_table(add_inplace_col_matrix_dispatch_table); + // which types are supported by the in-place kernels using fn_ns::AddInplaceTypeMapFactory; DispatchTableBuilder dtb9; @@ -216,6 +227,7 @@ void init_add(py::module_ m) using impl::add_inplace_contig_dispatch_table; using impl::add_inplace_output_id_table; using impl::add_inplace_row_matrix_dispatch_table; + using impl::add_inplace_col_matrix_dispatch_table; using impl::add_inplace_strided_dispatch_table; auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, @@ -232,7 +244,11 @@ void init_add(py::module_ m) // function pointers to handle inplace operation on // c-contig matrix with c-contig row with broadcasting // (may be nullptr) - add_inplace_row_matrix_dispatch_table); + add_inplace_row_matrix_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig column with broadcasting + // 
(may be nullptr) + add_inplace_col_matrix_dispatch_table); }; m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index a3924c8d28a6..9263d0213891 100644 --- a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -607,7 +607,10 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, template + typename contig_row_matrix_dispatchT, + // Optional table for the C-contiguous matrix += column broadcast + // case; defaulted so existing callers stay source-compatible. + typename contig_col_matrix_dispatchT = std::nullptr_t> std::pair py_binary_inplace_ufunc(const dpnp::tensor::usm_ndarray &lhs, const dpnp::tensor::usm_ndarray &rhs, @@ -618,7 +621,10 @@ std::pair const contig_dispatchT &contig_dispatch_table, const strided_dispatchT &strided_dispatch_table, const contig_row_matrix_dispatchT - &contig_row_matrix_broadcast_dispatch_table) + &contig_row_matrix_broadcast_dispatch_table, + const contig_col_matrix_dispatchT + &contig_col_matrix_broadcast_dispatch_table = + nullptr) { dpnp::tensor::validation::CheckWritable::throw_if_not_writable(lhs); @@ -745,9 +751,61 @@ std::pair } } if (nd == 2) { + static constexpr auto zero_one_strides = + std::initializer_list{0, 1}; static constexpr auto one_zero_strides = std::initializer_list{1, 0}; static constexpr py::ssize_t one{1}; + // C-contiguous matrix (lhs) and a row (rhs): D(N0,N1) += row(N1,) + // lhs strides {N1,1} = {shape[1],1}, rhs (row broadcast) strides {0,1} + if (isEqual(simplified_rhs_strides, zero_one_strides) && + isEqual(simplified_lhs_strides, {simplified_shape[1], one})) { + auto row_matrix_broadcast_fn = + contig_row_matrix_broadcast_dispatch_table[rhs_typeid] + 
[lhs_typeid]; + if (row_matrix_broadcast_fn != nullptr) { + int rhs_itemsize = rhs.get_elemsize(); + int lhs_itemsize = lhs.get_elemsize(); + if (is_aligned( + rhs_data + rhs_offset * rhs_itemsize) && + is_aligned( + lhs_data + lhs_offset * lhs_itemsize)) { + std::size_t n0 = simplified_shape[0]; + std::size_t n1 = simplified_shape[1]; + sycl::event comp_ev = row_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, rhs_data, rhs_offset, + lhs_data, lhs_offset, depends); + + return std::make_pair( + dpnp::utils::keep_args_alive( + exec_q, {lhs, rhs}, host_tasks), + comp_ev); + } + } + } + // C-contiguous matrix (lhs) and a column (rhs): D(N0,N1) += col(N0,1) + // rhs(col broadcast) strides {1,0}; lhs(C-contig) {shape[1],1} + if constexpr (!std::is_same_v) { + if (isEqual(simplified_rhs_strides, one_zero_strides) && + isEqual(simplified_lhs_strides, + {simplified_shape[1], one})) { + auto col_matrix_broadcast_fn = + contig_col_matrix_broadcast_dispatch_table[rhs_typeid] + [lhs_typeid]; + if (col_matrix_broadcast_fn != nullptr) { + std::size_t n0 = simplified_shape[0]; + std::size_t n1 = simplified_shape[1]; + sycl::event comp_ev = col_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, rhs_data, rhs_offset, + lhs_data, lhs_offset, depends); + return std::make_pair( + dpnp::utils::keep_args_alive( + exec_q, {lhs, rhs}, host_tasks), + comp_ev); + } + } + } // special case of C-contiguous matrix and a row if (isEqual(simplified_rhs_strides, one_zero_strides) && isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) { diff --git a/dpnp/tests/test_binary_ufuncs.py b/dpnp/tests/test_binary_ufuncs.py index 2e97047951a8..2b431f15fe19 100644 --- a/dpnp/tests/test_binary_ufuncs.py +++ b/dpnp/tests/test_binary_ufuncs.py @@ -121,6 +121,44 @@ def test_inplace_dtype_explicit(self, dtype1, dtype2): assert_raises(TypeError, numpy.add, a, b, out=a) assert_raises(ValueError, dpnp.add, ia, ib, out=ia) + @pytest.mark.parametrize( + "dtype", get_all_dtypes(no_none=True, no_bool=True) 
+ ) + @pytest.mark.parametrize( + "shape", [(4, 4), (3, 5), (100, 37), (513, 256)] + ) + def test_inplace_row_broadcast(self, shape, dtype): + # C-contiguous matrix += row vector, in place (broadcast over rows) + n0, n1 = shape + a = get_abs_array( + numpy.arange(n0 * n1).reshape(shape) % 7 + 1, dtype=dtype + ) + row = get_abs_array(numpy.arange(n1) % 3 + 1, dtype=dtype) + ia, irow = dpnp.array(a), dpnp.array(row) + + a += row + ia += irow + assert_dtype_allclose(ia, a) + + @pytest.mark.parametrize( + "dtype", get_all_dtypes(no_none=True, no_bool=True) + ) + @pytest.mark.parametrize( + "shape", [(4, 4), (3, 5), (100, 37), (513, 256)] + ) + def test_inplace_column_broadcast(self, shape, dtype): + # C-contiguous matrix += column vector, in place (broadcast over columns) + n0, n1 = shape + a = get_abs_array( + numpy.arange(n0 * n1).reshape(shape) % 7 + 1, dtype=dtype + ) + col = get_abs_array((numpy.arange(n0) % 3 + 1).reshape(n0, 1), dtype=dtype) + ia, icol = dpnp.array(a), dpnp.array(col) + + a += col + ia += icol + assert_dtype_allclose(ia, a) + @pytest.mark.parametrize("shape", [(0,), (15,), (2, 2)]) def test_invalid_shape(self, shape): a, b = dpnp.arange(10), dpnp.arange(10) From 43beb19ca975c0e66f653089a930a1dba5d582ea Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Mon, 29 Jun 2026 20:01:42 -0500 Subject: [PATCH 2/3] Update changelog with PR number #2981 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a7aa076c3fc..cdab0a6cb04d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ This release is compatible with NumPy 2.5. ### Changed -* Improved performance of in-place binary elementwise operations broadcasting a vector against a C-contiguous matrix (e.g. 
`m += row`, `m += col[:, None]`) on CPU by dispatching to vectorized broadcast kernels instead of the general strided kernel [#XXXX](https://github.com/IntelPython/dpnp/pull/XXXX) +* Improved performance of in-place binary elementwise operations broadcasting a vector against a C-contiguous matrix (e.g. `m += row`, `m += col[:, None]`) on CPU by dispatching to vectorized broadcast kernels instead of the general strided kernel [#2981](https://github.com/IntelPython/dpnp/pull/2981) * Changed `dpnp.meshgrid` and `dpnp.tensor.meshgrid` to return a tuple instead of a list, aligning with NumPy 2.5+ behavior and 2025.12 version of the Python array API standard [#2854](https://github.com/IntelPython/dpnp/pull/2854) * Updated `searchsorted` implementations to align with the 2025.12 array API spec [#2902](https://github.com/IntelPython/dpnp/pull/2902) * Updated tests to align with NumPy 2.4.5 compatibility [#2920](https://github.com/IntelPython/dpnp/pull/2920) From 5e1ea14fd8fca4509876fdf69984303515818622 Mon Sep 17 00:00:00 2001 From: vchamarthi Date: Mon, 29 Jun 2026 20:11:13 -0500 Subject: [PATCH 3/3] Apply pre-commit formatting (black, clang-format) --- .../kernels/elementwise_functions/add.hpp | 6 +-- .../elementwise_functions/common_inplace.hpp | 11 ++--- .../source/elementwise_functions/add.cpp | 4 +- .../elementwise_functions.hpp | 43 ++++++++++--------- dpnp/tests/test_binary_ufuncs.py | 12 +++--- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index 0aa81dc67ed6..a5c20d5ce124 100644 --- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -701,9 +701,9 @@ sycl::event add_inplace_col_matrix_broadcast_impl( { return elementwise_common::binary_inplace_col_matrix_broadcast_impl< argT, resT, 
AddInplaceColMatrixBroadcastingFunctor, - add_inplace_col_matrix_broadcast_krn>(exec_q, host_tasks, n0, n1, - vec_p, vec_offset, mat_p, - mat_offset, depends); + add_inplace_col_matrix_broadcast_krn>(exec_q, host_tasks, n0, n1, vec_p, + vec_offset, mat_p, mat_offset, + depends); } template diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index a87d0e0ae86e..43cb2c3e0f0d 100644 --- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -473,7 +473,8 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( // In-place column-broadcast: C-contiguous matrix += column vector. // Scalar kernel: one work-item per element, mat[gid] += vec[gid / n1] (n1 = row -// length so gid/n1 = row index). No sub-group collectives -> correct for any n1. +// length so gid/n1 = row index). No sub-group collectives -> correct for any +// n1. 
template struct BinaryInplaceColMatrixBroadcastingFunctor { @@ -511,8 +512,8 @@ typedef sycl::event (*binary_inplace_col_matrix_broadcast_impl_fn_ptr_t)( template - class BinaryInplaceColMatrixBroadcastFunctorT, + template class BinaryInplaceColMatrixBroadcastFunctorT, template class kernel_name> sycl::event binary_inplace_col_matrix_broadcast_impl( sycl::queue &exec_q, @@ -536,8 +537,8 @@ sycl::event binary_inplace_col_matrix_broadcast_impl( using Impl = BinaryInplaceColMatrixBroadcastFunctorT; - cgh.parallel_for>( - sycl::range<1>(n_elems), Impl(vec, mat, n1)); + cgh.parallel_for>(sycl::range<1>(n_elems), + Impl(vec, mat, n1)); }); return comp_ev; diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp index 99e7bd6a92bc..c3ba5c856c3e 100644 --- a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp +++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp @@ -63,9 +63,9 @@ using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; using ew_cmn_ns::binary_strided_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_col_matrix_broadcast_impl_fn_ptr_t; using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_inplace_col_matrix_broadcast_impl_fn_ptr_t; using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; // B01: ===== ADD (x1, x2) @@ -224,10 +224,10 @@ void init_add(py::module_ m) py::arg("depends") = py::list()); m.def("_add_result_type", add_result_type_pyapi, ""); + using impl::add_inplace_col_matrix_dispatch_table; using impl::add_inplace_contig_dispatch_table; using impl::add_inplace_output_id_table; using impl::add_inplace_row_matrix_dispatch_table; - using impl::add_inplace_col_matrix_dispatch_table; using impl::add_inplace_strided_dispatch_table; auto add_inplace_pyapi = [&](const arrayT &src, 
const arrayT &dst, diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 9263d0213891..f995499a2d91 100644 --- a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -611,20 +611,19 @@ template -std::pair - py_binary_inplace_ufunc(const dpnp::tensor::usm_ndarray &lhs, - const dpnp::tensor::usm_ndarray &rhs, - sycl::queue &exec_q, - const std::vector depends, - // - const output_typesT &output_type_table, - const contig_dispatchT &contig_dispatch_table, - const strided_dispatchT &strided_dispatch_table, - const contig_row_matrix_dispatchT - &contig_row_matrix_broadcast_dispatch_table, - const contig_col_matrix_dispatchT - &contig_col_matrix_broadcast_dispatch_table = - nullptr) +std::pair py_binary_inplace_ufunc( + const dpnp::tensor::usm_ndarray &lhs, + const dpnp::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector depends, + // + const output_typesT &output_type_table, + const contig_dispatchT &contig_dispatch_table, + const strided_dispatchT &strided_dispatch_table, + const contig_row_matrix_dispatchT + &contig_row_matrix_broadcast_dispatch_table, + const contig_col_matrix_dispatchT + &contig_col_matrix_broadcast_dispatch_table = nullptr) { dpnp::tensor::validation::CheckWritable::throw_if_not_writable(lhs); @@ -757,7 +756,8 @@ std::pair std::initializer_list{1, 0}; static constexpr py::ssize_t one{1}; // C-contiguous matrix (lhs) and a row (rhs): D(N0,N1) += row(N1,) - // lhs strides {N1,1} = {shape[1],1}, rhs (row broadcast) strides {0,1} + // lhs strides {N1,1} = {shape[1],1}, rhs (row broadcast) strides + // {0,1} if (isEqual(simplified_rhs_strides, zero_one_strides) && isEqual(simplified_lhs_strides, {simplified_shape[1], one})) { auto row_matrix_broadcast_fn = @@ -777,14 +777,15 @@ std::pair lhs_data, lhs_offset, 
depends); return std::make_pair( - dpnp::utils::keep_args_alive( - exec_q, {lhs, rhs}, host_tasks), + dpnp::utils::keep_args_alive(exec_q, {lhs, rhs}, + host_tasks), comp_ev); } } } - // C-contiguous matrix (lhs) and a column (rhs): D(N0,N1) += col(N0,1) - // rhs(col broadcast) strides {1,0}; lhs(C-contig) {shape[1],1} + // C-contiguous matrix (lhs) and a column (rhs): + // D(N0,N1) += col(N0,1); rhs (column broadcast) has strides {1,0}, + // lhs (C-contiguous) has strides {shape[1],1} if constexpr (!std::is_same_v) { if (isEqual(simplified_rhs_strides, one_zero_strides) && @@ -800,8 +801,8 @@ std::pair exec_q, host_tasks, n0, n1, rhs_data, rhs_offset, lhs_data, lhs_offset, depends); return std::make_pair( - dpnp::utils::keep_args_alive( - exec_q, {lhs, rhs}, host_tasks), + dpnp::utils::keep_args_alive(exec_q, {lhs, rhs}, + host_tasks), comp_ev); } } diff --git a/dpnp/tests/test_binary_ufuncs.py b/dpnp/tests/test_binary_ufuncs.py index 2b431f15fe19..f71da9b12572 100644 --- a/dpnp/tests/test_binary_ufuncs.py +++ b/dpnp/tests/test_binary_ufuncs.py @@ -124,9 +124,7 @@ def test_inplace_dtype_explicit(self, dtype1, dtype2): @pytest.mark.parametrize( "dtype", get_all_dtypes(no_none=True, no_bool=True) ) - @pytest.mark.parametrize( - "shape", [(4, 4), (3, 5), (100, 37), (513, 256)] - ) + @pytest.mark.parametrize("shape", [(4, 4), (3, 5), (100, 37), (513, 256)]) def test_inplace_row_broadcast(self, shape, dtype): # C-contiguous matrix += row vector, in place (broadcast over rows) n0, n1 = shape @@ -143,16 +141,16 @@ def test_inplace_row_broadcast(self, shape, dtype): @pytest.mark.parametrize( "dtype", get_all_dtypes(no_none=True, no_bool=True) ) - @pytest.mark.parametrize( - "shape", [(4, 4), (3, 5), (100, 37), (513, 256)] - ) + @pytest.mark.parametrize("shape", [(4, 4), (3, 5), (100, 37), (513, 256)]) def test_inplace_column_broadcast(self, shape, dtype): # C-contiguous matrix += column vector, in place (broadcast over columns) n0, n1 = shape a = get_abs_array( numpy.arange(n0 * n1).reshape(shape)
% 7 + 1, dtype=dtype ) - col = get_abs_array((numpy.arange(n0) % 3 + 1).reshape(n0, 1), dtype=dtype) + col = get_abs_array( + (numpy.arange(n0) % 3 + 1).reshape(n0, 1), dtype=dtype + ) ia, icol = dpnp.array(a), dpnp.array(col) a += col