Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ This release is compatible with NumPy 2.5.

### Changed

* Improved performance of in-place binary elementwise operations broadcasting a vector against a C-contiguous matrix (e.g. `m += row`, `m += col[:, None]`) on CPU by dispatching to vectorized broadcast kernels instead of the general strided kernel [#2981](https://github.com/IntelPython/dpnp/pull/2981)
* Changed `dpnp.meshgrid` and `dpnp.tensor.meshgrid` to return a tuple instead of a list, aligning with NumPy 2.5+ behavior and 2025.12 version of the Python array API standard [#2854](https://github.com/IntelPython/dpnp/pull/2854)
* Updated `searchsorted` implementations to align with the 2025.12 array API spec [#2902](https://github.com/IntelPython/dpnp/pull/2902)
* Updated tests to align with NumPy 2.4.5 compatibility [#2920](https://github.com/IntelPython/dpnp/pull/2920)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -676,4 +676,57 @@ struct AddInplaceRowMatrixBroadcastFactory
}
};

// In-place column-broadcast wiring for add.
template <typename argT, typename resT>
class add_inplace_col_matrix_broadcast_krn;

template <typename argT, typename resT>
using AddInplaceColMatrixBroadcastingFunctor =
elementwise_common::BinaryInplaceColMatrixBroadcastingFunctor<
argT,
resT,
AddInplaceFunctor<argT, resT>>;

template <typename argT, typename resT>
sycl::event add_inplace_col_matrix_broadcast_impl(
sycl::queue &exec_q,
std::vector<sycl::event> &host_tasks,
std::size_t n0,
std::size_t n1,
const char *vec_p,
ssize_t vec_offset,
char *mat_p,
ssize_t mat_offset,
const std::vector<sycl::event> &depends = {})
{
return elementwise_common::binary_inplace_col_matrix_broadcast_impl<
argT, resT, AddInplaceColMatrixBroadcastingFunctor,
add_inplace_col_matrix_broadcast_krn>(exec_q, host_tasks, n0, n1, vec_p,
vec_offset, mat_p, mat_offset,
depends);
}

template <typename fnT, typename T1, typename T2>
struct AddInplaceColMatrixBroadcastFactory
{
fnT get()
{
if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
fnT fn = nullptr;
return fn;
}
else {
if constexpr (dpnp::tensor::type_utils::is_complex<T1>::value ||
dpnp::tensor::type_utils::is_complex<T2>::value) {
fnT fn = nullptr;
return fn;
}
else {
fnT fn = add_inplace_col_matrix_broadcast_impl<T1, T2>;
return fn;
}
}
}
};

} // namespace dpnp::tensor::kernels::add
Original file line number Diff line number Diff line change
Expand Up @@ -471,4 +471,77 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
return comp_ev;
}

// In-place column-broadcast: C-contiguous matrix += column vector.
// Scalar kernel: one work-item per element, mat[gid] += vec[gid / n1] (n1 = row
// length so gid/n1 = row index). No sub-group collectives -> correct for any
// n1.
template <typename argT, typename resT, typename BinaryOperatorT>
struct BinaryInplaceColMatrixBroadcastingFunctor
{
private:
const argT *vec; // (n0,) contiguous column vector, one scalar per row
resT *mat; // (n0, n1) C-contiguous matrix
std::size_t n1; // contiguous row length

public:
BinaryInplaceColMatrixBroadcastingFunctor(const argT *col_tp,
resT *mat_tp,
std::size_t n_elems_in_row)
: vec(col_tp), mat(mat_tp), n1(n_elems_in_row)
{
}

void operator()(sycl::id<1> wid) const
{
BinaryOperatorT op{};
const std::size_t gid = wid.get(0);
op(mat[gid], vec[gid / n1]);
}
};

typedef sycl::event (*binary_inplace_col_matrix_broadcast_impl_fn_ptr_t)(
sycl::queue &,
std::vector<sycl::event> &,
std::size_t,
std::size_t,
const char *,
ssize_t,
char *,
ssize_t,
const std::vector<sycl::event> &);

template <typename argT,
typename resT,
template <typename T1,
typename T3> class BinaryInplaceColMatrixBroadcastFunctorT,
template <typename T1, typename T3> class kernel_name>
sycl::event binary_inplace_col_matrix_broadcast_impl(
sycl::queue &exec_q,
std::vector<sycl::event> &host_tasks,
std::size_t n0,
std::size_t n1,
const char *vec_p,
ssize_t vec_offset,
char *mat_p,
ssize_t mat_offset,
const std::vector<sycl::event> &depends = {})
{
const argT *vec = reinterpret_cast<const argT *>(vec_p) + vec_offset;
resT *mat = reinterpret_cast<resT *>(mat_p) + mat_offset;
(void)host_tasks; // no padding/temporary needed for column broadcast

const std::size_t n_elems = n0 * n1;

sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

using Impl = BinaryInplaceColMatrixBroadcastFunctorT<argT, resT>;

cgh.parallel_for<class kernel_name<argT, resT>>(sycl::range<1>(n_elems),
Impl(vec, mat, n1));
});

return comp_ev;
}

} // namespace dpnp::tensor::kernels::elementwise_common
18 changes: 17 additions & 1 deletion dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
using ew_cmn_ns::binary_strided_impl_fn_ptr_t;

using ew_cmn_ns::binary_inplace_col_matrix_broadcast_impl_fn_ptr_t;
using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
Expand Down Expand Up @@ -98,6 +99,8 @@ static binary_inplace_strided_impl_fn_ptr_t
add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
static binary_inplace_col_matrix_broadcast_impl_fn_ptr_t
add_inplace_col_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];

void populate_add_dispatch_tables(void)
{
Expand Down Expand Up @@ -165,6 +168,14 @@ void populate_add_dispatch_tables(void)
dtb8;
dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table);

// function pointers for the in-place c-contig matrix += column
// broadcast operation
using fn_ns::AddInplaceColMatrixBroadcastFactory;
DispatchTableBuilder<binary_inplace_col_matrix_broadcast_impl_fn_ptr_t,
AddInplaceColMatrixBroadcastFactory, num_types>
dtb10;
dtb10.populate_dispatch_table(add_inplace_col_matrix_dispatch_table);

// which types are supported by the in-place kernels
using fn_ns::AddInplaceTypeMapFactory;
DispatchTableBuilder<int, AddInplaceTypeMapFactory, num_types> dtb9;
Expand Down Expand Up @@ -213,6 +224,7 @@ void init_add(py::module_ m)
py::arg("depends") = py::list());
m.def("_add_result_type", add_result_type_pyapi, "");

using impl::add_inplace_col_matrix_dispatch_table;
using impl::add_inplace_contig_dispatch_table;
using impl::add_inplace_output_id_table;
using impl::add_inplace_row_matrix_dispatch_table;
Expand All @@ -232,7 +244,11 @@ void init_add(py::module_ m)
// function pointers to handle inplace operation on
// c-contig matrix with c-contig row with broadcasting
// (may be nullptr)
add_inplace_row_matrix_dispatch_table);
add_inplace_row_matrix_dispatch_table,
// function pointers to handle inplace operation on
// c-contig matrix with c-contig column with broadcasting
// (may be nullptr)
add_inplace_col_matrix_dispatch_table);
};
m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
py::arg("rhs"), py::arg("sycl_queue"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -607,18 +607,23 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype,
template <typename output_typesT,
typename contig_dispatchT,
typename strided_dispatchT,
typename contig_row_matrix_dispatchT>
std::pair<sycl::event, sycl::event>
py_binary_inplace_ufunc(const dpnp::tensor::usm_ndarray &lhs,
const dpnp::tensor::usm_ndarray &rhs,
sycl::queue &exec_q,
const std::vector<sycl::event> depends,
//
const output_typesT &output_type_table,
const contig_dispatchT &contig_dispatch_table,
const strided_dispatchT &strided_dispatch_table,
const contig_row_matrix_dispatchT
&contig_row_matrix_broadcast_dispatch_table)
typename contig_row_matrix_dispatchT,
// Optional table for the C-contiguous matrix += column broadcast
// case; defaulted so existing callers stay source-compatible.
typename contig_col_matrix_dispatchT = std::nullptr_t>
std::pair<sycl::event, sycl::event> py_binary_inplace_ufunc(
const dpnp::tensor::usm_ndarray &lhs,
const dpnp::tensor::usm_ndarray &rhs,
sycl::queue &exec_q,
const std::vector<sycl::event> depends,
//
const output_typesT &output_type_table,
const contig_dispatchT &contig_dispatch_table,
const strided_dispatchT &strided_dispatch_table,
const contig_row_matrix_dispatchT
&contig_row_matrix_broadcast_dispatch_table,
const contig_col_matrix_dispatchT
&contig_col_matrix_broadcast_dispatch_table = nullptr)
{
dpnp::tensor::validation::CheckWritable::throw_if_not_writable(lhs);

Expand Down Expand Up @@ -745,9 +750,63 @@ std::pair<sycl::event, sycl::event>
}
}
if (nd == 2) {
static constexpr auto zero_one_strides =
std::initializer_list<py::ssize_t>{0, 1};
static constexpr auto one_zero_strides =
std::initializer_list<py::ssize_t>{1, 0};
static constexpr py::ssize_t one{1};
// C-contiguous matrix (lhs) and a row (rhs): D(N0,N1) += row(N1,)
// lhs strides {N1,1} = {shape[1],1}, rhs (row broadcast) strides
// {0,1}
if (isEqual(simplified_rhs_strides, zero_one_strides) &&
isEqual(simplified_lhs_strides, {simplified_shape[1], one})) {
auto row_matrix_broadcast_fn =
contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
[lhs_typeid];
if (row_matrix_broadcast_fn != nullptr) {
int rhs_itemsize = rhs.get_elemsize();
int lhs_itemsize = lhs.get_elemsize();
if (is_aligned<required_alignment>(
rhs_data + rhs_offset * rhs_itemsize) &&
is_aligned<required_alignment>(
lhs_data + lhs_offset * lhs_itemsize)) {
std::size_t n0 = simplified_shape[0];
std::size_t n1 = simplified_shape[1];
sycl::event comp_ev = row_matrix_broadcast_fn(
exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
lhs_data, lhs_offset, depends);

return std::make_pair(
dpnp::utils::keep_args_alive(exec_q, {lhs, rhs},
host_tasks),
comp_ev);
}
}
}
// C-contiguous matrix (lhs) and a column (rhs): D(N0,N1) +=
// col(N0,1) rhs(col broadcast) strides {1,0}; lhs(C-contig)
// {shape[1],1}
if constexpr (!std::is_same_v<contig_col_matrix_dispatchT,
std::nullptr_t>) {
if (isEqual(simplified_rhs_strides, one_zero_strides) &&
isEqual(simplified_lhs_strides,
{simplified_shape[1], one})) {
auto col_matrix_broadcast_fn =
contig_col_matrix_broadcast_dispatch_table[rhs_typeid]
[lhs_typeid];
if (col_matrix_broadcast_fn != nullptr) {
std::size_t n0 = simplified_shape[0];
std::size_t n1 = simplified_shape[1];
sycl::event comp_ev = col_matrix_broadcast_fn(
exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
lhs_data, lhs_offset, depends);
return std::make_pair(
dpnp::utils::keep_args_alive(exec_q, {lhs, rhs},
host_tasks),
comp_ev);
}
}
}
// special case of C-contiguous matrix and a row
if (isEqual(simplified_rhs_strides, one_zero_strides) &&
isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) {
Expand Down
36 changes: 36 additions & 0 deletions dpnp/tests/test_binary_ufuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,42 @@ def test_inplace_dtype_explicit(self, dtype1, dtype2):
assert_raises(TypeError, numpy.add, a, b, out=a)
assert_raises(ValueError, dpnp.add, ia, ib, out=ia)

@pytest.mark.parametrize(
"dtype", get_all_dtypes(no_none=True, no_bool=True)
)
@pytest.mark.parametrize("shape", [(4, 4), (3, 5), (100, 37), (513, 256)])
def test_inplace_row_broadcast(self, shape, dtype):
# C-contiguous matrix += row vector, in place (broadcast over rows)
n0, n1 = shape
a = get_abs_array(
numpy.arange(n0 * n1).reshape(shape) % 7 + 1, dtype=dtype
)
row = get_abs_array(numpy.arange(n1) % 3 + 1, dtype=dtype)
ia, irow = dpnp.array(a), dpnp.array(row)

a += row
ia += irow
assert_dtype_allclose(ia, a)

@pytest.mark.parametrize(
"dtype", get_all_dtypes(no_none=True, no_bool=True)
)
@pytest.mark.parametrize("shape", [(4, 4), (3, 5), (100, 37), (513, 256)])
def test_inplace_column_broadcast(self, shape, dtype):
# C-contiguous matrix += column vector, in place (broadcast over columns)
n0, n1 = shape
a = get_abs_array(
numpy.arange(n0 * n1).reshape(shape) % 7 + 1, dtype=dtype
)
col = get_abs_array(
(numpy.arange(n0) % 3 + 1).reshape(n0, 1), dtype=dtype
)
ia, icol = dpnp.array(a), dpnp.array(col)

a += col
ia += icol
assert_dtype_allclose(ia, a)

@pytest.mark.parametrize("shape", [(0,), (15,), (2, 2)])
def test_invalid_shape(self, shape):
a, b = dpnp.arange(10), dpnp.arange(10)
Expand Down
Loading