From 3319dc966f7782a0ef46a40fa9e8c52571e70f2c Mon Sep 17 00:00:00 2001 From: Ilya Maltsev Date: Sun, 14 Dec 2025 19:06:11 +0300 Subject: [PATCH 1/7] feat: mask support in kronecker product for intermediate result matrix --- Source/kronecker/GB_kron.c | 371 ++++++++++++++++++++++++++++++++++- Source/kronecker/GB_kron.h | 3 + Source/kronecker/GB_kroner.c | 3 + Test/test226.m | 14 ++ Test/test227.m | 73 ++++++- 5 files changed, 458 insertions(+), 6 deletions(-) diff --git a/Source/kronecker/GB_kron.c b/Source/kronecker/GB_kron.c index 5c462da512..fa5c5adaed 100644 --- a/Source/kronecker/GB_kron.c +++ b/Source/kronecker/GB_kron.c @@ -23,11 +23,71 @@ GB_Matrix_free (&T) ; \ } +#define GBI(Ai,p,avlen) ((Ai == NULL) ? ((p) % (avlen)) : Ai [p]) + +#define GBB(Ab,p) ((Ab == NULL) ? 1 : Ab [p]) + +#define GBP(Ap,k,avlen) ((Ap == NULL) ? ((k) * (avlen)) : Ap [k]) + +#define GBH(Ah,k) ((Ah == NULL) ? (k) : Ah [k]) + #include "kronecker/GB_kron.h" #include "mxm/GB_mxm.h" #include "transpose/GB_transpose.h" #include "mask/GB_accum_mask.h" +static bool GB_lookup_xoffset ( + GrB_Index* p, + GrB_Matrix A, + GrB_Index row, + GrB_Index col +) +{ + GrB_Index vector = A->is_csc ? col : row ; + GrB_Index coord = A->is_csc ? row : col ; + + if (A->p == NULL) + { + GrB_Index offset = vector * A->vlen + coord ; + if (A->b == NULL || ((int8_t*)A->b)[offset]) + { + *p = A->iso ? 0 : offset ; + return true ; + } + return false ; + } + + int64_t start, end ; + bool res ; + + if (A->h == NULL) + { + start = A->p_is_32 ? ((uint32_t*)A->p)[vector] : ((uint64_t*)A->p)[vector] ; + end = A->p_is_32 ? ((uint32_t*)A->p)[vector + 1] : ((uint64_t*)A->p)[vector + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } + else + { + start = 0 ; end = A->plen - 1 ; + res = GB_binary_search(vector, A->h, A->j_is_32, &start, &end) ; + if (!res) return false ; + int64_t k = start ; + start = A->p_is_32 ? ((uint32_t*)A->p)[k] : ((uint64_t*)A->p)[k] ; + end = A->p_is_32 ? ((uint32_t*)A->p)[k+1] : ((uint64_t*)A->p)[k+1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } +} + +#include "emult/GB_emult.h" + GrB_Info GB_kron // C = accum (C, kron(A,B)) ( GrB_Matrix C, // input/output matrix for results @@ -104,6 +164,315 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) // quick return if an empty mask is complemented GB_RETURN_IF_QUICK_MASK (C, C_replace, M, Mask_comp, Mask_struct) ; + // check if it's possible to apply mask immediately in kron + // TODO: make MT of same CSR/CSC format as C + // TODO: MT should have its own 32/64 bitness controls + // TODO: clear MT header + + GrB_Matrix MT = NULL; + if (M != NULL && !Mask_comp) + { + // iterate over mask, count how many elements will be present in MT + // initialize MT->p + + GB_MATRIX_WAIT(M); + + size_t allocated = 0 ; + bool MT_hypersparse = (A->h != NULL) || (B->h != NULL); + int64_t centries ; + uint64_t nvecs ; + centries = 0 ; + nvecs = 0 ; + + uint32_t* MTp32 = NULL ; uint64_t* MTp64 = NULL ; + MTp32 = M->p_is_32 ? GB_calloc_memory (M->vdim + 1, sizeof(uint32_t), &allocated) : NULL ; + MTp64 = M->p_is_32 ? NULL : GB_calloc_memory (M->vdim + 1, sizeof(uint64_t), &allocated) ; + if (MTp32 == NULL && MTp64 == NULL) + { + OUT_OF_MEM_p: + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + + GrB_Type MTtype = op->ztype ; + const size_t MTsize = MTtype->size ; + GB_void MTscalar [GB_VLA(MTsize)] ; + bool MTiso = GB_emult_iso (MTscalar, MTtype, A, B, op) ; + + GB_Mp_DECLARE(Mp, ) ; + GB_Mp_PTR(Mp, M) ; + + GB_Mh_DECLARE(Mh, ) ; + GB_Mh_PTR(Mh, M) ; + + GB_Mi_DECLARE(Mi, ) ; + GB_Mi_PTR(Mi, M) ; + + GB_cast_function cast_A = NULL ; + GB_cast_function cast_B = NULL ; + + cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; + cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; + + int64_t vlen = M->vlen ; + #pragma omp parallel + { + GrB_Index offset ; + + #pragma omp for reduction(+:nvecs) + for (GrB_Index k = 0 ; k < M->nvec ; k++) + { + GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; + + int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; + int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; + bool nonempty = false ; + for (GrB_Index p = pA_start ; p < pA_end ; p++) + { + if (!GBB (M->b, p)) continue ; + + int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; + GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; + + // extract elements from A and B, increment MTp + + if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + + bool code = GB_lookup_xoffset(&offset, A, arow, acol) ; + if (!code) + { + continue; + } + + code = GB_lookup_xoffset(&offset, B, brow, bcol) ; + if (!code) + { + continue; + } + + if (M->p_is_32) + { + (MTp32[j])++ ; + } + else + { + (MTp64[j])++ ; + } + nonempty = true ; + } + } + if (nonempty) nvecs++ ; + } + } + + // GB_cumsum for MT->p + + double work = M->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int cumsum_threads = GB_nthreads (work, chunk, nthreads_max) ; + M->p_is_32 ? GB_cumsum(MTp32, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) : + GB_cumsum(MTp64, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) ; + + centries = M->p_is_32 ? MTp32[M->vdim] : MTp64[M->vdim] ; + + uint32_t* MTi32 = NULL ; uint64_t* MTi64 = NULL; + MTi32 = M->i_is_32 ? GB_malloc_memory (centries, sizeof(uint32_t), &allocated) : NULL ; + MTi64 = M->i_is_32 ? NULL : GB_malloc_memory (centries, sizeof(uint64_t), &allocated) ; + + if (centries > 0 && MTi32 == NULL && MTi64 == NULL) + { + OUT_OF_MEM_i: + if (M->p_is_32) { GB_free_memory (&MTp32, (M->vdim + 1) * sizeof(uint32_t)) ; } + else { GB_free_memory (&MTp64, (M->vdim + 1) * sizeof(uint64_t)) ; } + goto OUT_OF_MEM_p ; + } + + void* MTx = NULL ; + if (!MTiso) + { + MTx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; + } + else + { + MTx = GB_malloc_memory (1, op->ztype->size, &allocated) ; + if (MTx == NULL) goto OUT_OF_MEM_x ; + memcpy (MTx, MTscalar, MTsize) ; + } + + if (centries > 0 && MTx == NULL) + { + OUT_OF_MEM_x: + if (M->i_is_32) { GB_free_memory (&MTi32, centries * sizeof(uint32_t)) ; } + else { GB_free_memory (&MTi64, centries * sizeof (uint64_t)) ; } + goto OUT_OF_MEM_i ; + } + + #pragma omp parallel + { + GrB_Index offset ; + GB_void a_elem[op->xtype->size] ; + GB_void b_elem[op->ytype->size] ; + + #pragma omp for + for (GrB_Index k = 0 ; k < M->nvec ; k++) + { + GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; + + int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; + int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; + GrB_Index pos = M->p_is_32 ? MTp32[j] : MTp64[j] ; + for (GrB_Index p = pA_start ; p < pA_end ; p++) + { + if (!GBB (M->b, p)) continue ; + + int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; + GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; + + // extract elements from A and B, + // initialize offset in MTi and MTx, + // get result of op, place it in MTx + + if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + + bool code = GB_lookup_xoffset (&offset, A, arow, acol) ; + if (!code) + { + continue; + } + if (!MTiso) + cast_A (a_elem, A->x + offset * A->type->size, A->type->size) ; + + code = GB_lookup_xoffset (&offset, B, brow, bcol) ; + if (!code) + { + continue; + } + if (!MTiso) + cast_B (b_elem, B->x + offset * B->type->size, B->type->size) ; + + if (!MTiso) + { + if (op->binop_function) + { + op->binop_function (MTx + op->ztype->size * pos, a_elem, b_elem) ; + } + else + { + GrB_Index ix, iy, jx, jy ; + ix = A_transpose ? acol : arow ; + iy = A_transpose ? arow : acol ; + jx = B_transpose ? bcol : brow ; + jy = B_transpose ? brow : bcol ; + op->idxbinop_function (MTx + op->ztype->size * pos, a_elem, ix, iy, + b_elem, jx, jy, op->theta) ; + } + } + + if (M->i_is_32) { MTi32[pos] = i ; } else { MTi64[pos] = i ; } + pos++ ; + } + } + } + } + + #undef GBI + #undef GBB + #undef GBP + #undef GBH + + // initialize other fields of MT properly + + MT = NULL ; struct GB_Matrix_opaque MT_header ; + GB_CLEAR_MATRIX_HEADER (MT, &MT_header) ; + GrB_Info MTalloc = GB_new_bix (&MT, op->ztype, vlen, M->vdim, GB_ph_null, M->is_csc, + GxB_SPARSE, false, M->hyper_switch, M->vdim, centries, false, MTiso, + M->p_is_32, M->j_is_32, M->i_is_32) ; + if (MTalloc != GrB_SUCCESS) + { + if (MTiso) { GB_free_memory (&MTx, op->ztype->size) ; } + else { GB_free_memory (&MTx, centries * op->ztype->size) ; } + goto OUT_OF_MEM_x ; + } + + GB_free_memory (&MT->i, MT->i_size) ; + GB_free_memory (&MT->x, MT->x_size) ; + + MT->p = M->p_is_32 ? (void*)MTp32 : (void*)MTp64 ; + MT->i = M->i_is_32 ? (void*)MTi32 : (void*)MTi64 ; + MT->x = MTx ; + + MT->p_size = (M->p_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * (M->vdim + 1) ; + MT->i_size = ((M->i_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * centries) ; + MT->x_size = MT->iso ? op->ztype->size : op->ztype->size * centries ; + MT->magic = GB_MAGIC ; + MT->nvals = centries ; + MT->nvec_nonempty = nvecs ; + + // transpose and convert to hyper if needed + + if (MT->is_csc != C->is_csc) + { + GrB_Info MTtranspose = GB_transpose_in_place (MT, true, Werk) ; + if (MTtranspose != GrB_SUCCESS) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return MTtranspose ; + } + } + + if (MT_hypersparse) + { + uint32_t* MTh32 = NULL ; uint64_t* MTh64 = NULL ; + if (MT->j_is_32) + { + MTh32 = GB_malloc_memory (MT->vdim, sizeof(uint32_t), &allocated) ; + } + else + { + MTh64 = GB_malloc_memory (MT->vdim, sizeof(uint64_t), &allocated) ; + } + + if (MTh32 == NULL && MTh64 == NULL) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return GrB_OUT_OF_MEMORY ; + } + + #pragma omp parallel for + for (GrB_Index i = 0; i < MT->vdim; i++) + { + if (MT->j_is_32) { MTh32[i] = i ; } else { MTh64[i] = i ; } + } + + MT->h = MTh32 ? (void*)MTh32 : (void*)MTh64 ; + + GrB_Info MThyperprune = GB_hyper_prune (MT, Werk) ; + if (MThyperprune != GrB_SUCCESS) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return MThyperprune ; + } + } + + return (GB_accum_mask (C, NULL, NULL, accum, &MT, C_replace, Mask_comp, Mask_struct, Werk)) ; + } + //-------------------------------------------------------------------------- // transpose A and B if requested //-------------------------------------------------------------------------- @@ -153,7 +522,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) GB_CLEAR_MATRIX_HEADER (T, &T_header) ; GB_OK (GB_kroner (T, T_is_csc, op, flipij, A_transpose ? AT : A, A_is_pattern, - B_transpose ? BT : B, B_is_pattern, Werk)) ; + B_transpose ? BT : B, B_is_pattern, M, Mask_comp, Mask_struct, Werk)) ; GB_FREE_WORKSPACE ; ASSERT_MATRIX_OK (T, "T = kron(A,B)", GB0) ; diff --git a/Source/kronecker/GB_kron.h b/Source/kronecker/GB_kron.h index a6c1a3f3ae..d1628b0899 100644 --- a/Source/kronecker/GB_kron.h +++ b/Source/kronecker/GB_kron.h @@ -37,6 +37,9 @@ GrB_Info GB_kroner // C = kron (A,B) bool A_is_pattern, // true if values of A are not used const GrB_Matrix B, // input matrix bool B_is_pattern, // true if values of B are not used + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, GB_Werk Werk ) ; diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index afbb655ee4..4fb265b887 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -39,6 +39,9 @@ GrB_Info GB_kroner // C = kron (A,B) bool A_is_pattern, // true if values of A are not used const GrB_Matrix B_in, // input matrix bool B_is_pattern, // true if values of B are not used + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, GB_Werk Werk ) { diff --git a/Test/test226.m b/Test/test226.m index 41f77fd3d3..4778425a8a 100644 --- a/Test/test226.m +++ b/Test/test226.m @@ -9,6 +9,8 @@ A.matrix = sprand (5, 10, 0.4) ; B.matrix = ones (3, 2) ; B.iso = true ; +M.matrix = sprandn (15, 20,0.2) ~= 0 ; +MT.matrix = sprandn (9, 4, 20,0.2) ~= 0 ; mult.opname = 'times' ; mult.optype = 'double' ; @@ -18,14 +20,26 @@ C2 = GB_spec_kron (Cin, [ ], [ ], mult, A, B, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, M, [ ], mult, A, B, [ ]) ; +C2 = GB_spec_kron (Cin, M, [ ], mult, A, B, [ ]) ; +GB_spec_compare (C1, C2) ; + C1 = GB_mex_kron (Cin, [ ], [ ], mult, B, A, [ ]) ; C2 = GB_spec_kron (Cin, [ ], [ ], mult, B, A, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, M, [ ], mult, B, A, [ ]) ; +C2 = GB_spec_kron (Cin, M, [ ], mult, B, A, [ ]) ; +GB_spec_compare (C1, C2) ; + Cin = sparse (9, 4) ; C1 = GB_mex_kron (Cin, [ ], [ ], mult, B, B, [ ]) ; C2 = GB_spec_kron (Cin, [ ], [ ], mult, B, B, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, MT, [ ], mult, B, B, [ ]) ; +C2 = GB_spec_kron (Cin, MT, [ ], mult, B, B, [ ]) ; +GB_spec_compare (C1, C2) ; + fprintf ('\ntest226: all tests passed\n') ; diff --git a/Test/test227.m b/Test/test227.m index 98e75da2e2..50dd0153d1 100644 --- a/Test/test227.m +++ b/Test/test227.m @@ -16,6 +16,16 @@ dnt = struct ( 'inp1', 'tran' ) ; dtt = struct ( 'inp0', 'tran', 'inp1', 'tran' ) ; +dnn.mask = 'default' ; +dtn.mask = 'default' ; +dnt.mask = 'default' ; +dtt.mask = 'default' ; + +dnn.outp = 'default' ; +dtn.outp = 'default' ; +dnt.outp = 'default' ; +dtt.outp = 'default' ; + types = { 'int32', 'int64', 'single', 'double' } ; am = 5 ; @@ -23,13 +33,18 @@ bm = 4 ; bn = 2 ; -Ax = sparse (100 * sprandn (am,an, 0.5)) ; -Bx = sparse (100 * sprandn (bm,bn, 0.5)) ; +Ax_temp = 100 * sprandn (am, an, 0.5); +Bx_temp = 100 * sprandn (bm, bn, 0.5); + +Ax = sparse(round(Ax_temp)); +Bx = sparse(round(Bx_temp)); + cm = am * bm ; cn = an * bn ; Cx = sparse (cm,cn) ; -AT = Ax' ; -BT = Bx' ; +Maskmat = sprandn (cm,cn,0.2) ~= 0 ; +ATmat = Ax' ; +BTmat = Bx' ; for k2 = [4 7 45:52 ] for k1 = 1:4 @@ -64,9 +79,19 @@ B.is_csc = B_is_csc ; clear C - C.matrix = Cx ; + C.matrix = sparse (cm,cn) ; C.is_csc = C_is_csc ; + clear AT + AT.matrix = ATmat ; + AT.is_hyper = A_is_hyper ; + AT.is_csc = A.is_csc ; + + clear BT + BT.matrix = BTmat ; + BT.is_hyper = B_is_hyper ; + BT.is_csc = B.is_csc ; + %--------------------------------------- % kron(A,B) %--------------------------------------- @@ -103,6 +128,44 @@ C1 = GB_mex_kron (C, [ ], [ ], op, AT, BT, dtt) ; GB_spec_compare (C0, C1) ; + % tests with Mask + for Mask_is_hyper = 0:1 + for Mask_is_csc = 0:1 + fprintf('*') + + A.is_csc = A_is_csc ; + B.is_csc = B_is_csc ; + A.is_hyper = A_is_hyper ; + B.is_hyper = B_is_hyper ; + + clear M + M.matrix = Maskmat ; + M.is_hyper = Mask_is_hyper ; + M.is_csc = Mask_is_csc; + C.is_csc = C_is_csc ; + + % kron(A, B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, B, dnn) ; + fprintf('#') ; + C1 = GB_mex_kron (C, M, [ ], op, A, B, dnn) ; + GB_spec_compare(C0, C1) ; + + % kron(A', B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, B, dtn) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, B, dtn) ; + GB_spec_compare (C0, C1) ; + + % kron(A, B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, BT, dnt) ; + C1 = GB_mex_kron (C, M, [ ], op, A, BT, dnt) ; + GB_spec_compare (C0, C1) ; + + % kron(A', B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, BT, dtt) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, BT, dtt) ; + GB_spec_compare (C0, C1) ; + end + end end end end From fc60e3cade5e1985bc69e4eef859821186fabd8f Mon Sep 17 00:00:00 2001 From: Ilya Maltsev Date: Wed, 25 Mar 2026 08:40:12 +0300 Subject: [PATCH 2/7] feat: moved masked kronecker to GB_kroner --- Source/kronecker/GB_kron.c | 263 ++-------------------------- Source/kronecker/GB_kron.h | 4 +- Source/kronecker/GB_kroner.c | 324 ++++++++++++++++++++++++++++++++++- 3 files changed, 337 insertions(+), 254 deletions(-) diff --git a/Source/kronecker/GB_kron.c b/Source/kronecker/GB_kron.c index fa5c5adaed..e5894af475 100644 --- a/Source/kronecker/GB_kron.c +++ b/Source/kronecker/GB_kron.c @@ -169,262 +169,21 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) // TODO: MT should have its own 32/64 bitness controls // TODO: clear MT header - GrB_Matrix MT = NULL; - if (M != NULL && !Mask_comp) - { - // iterate over mask, count how many elements will be present in MT - // initialize MT->p - - GB_MATRIX_WAIT(M); - + bool Mask_is_applicable = M != NULL && !Mask_comp ; + if (Mask_is_applicable) { + bool MT_hypersparse = (A->h != NULL) || (B->h != NULL) ; size_t allocated = 0 ; - bool MT_hypersparse = (A->h != NULL) || (B->h != NULL); - int64_t centries ; - uint64_t nvecs ; - centries = 0 ; - nvecs = 0 ; - - uint32_t* MTp32 = NULL ; uint64_t* MTp64 = NULL ; - MTp32 = M->p_is_32 ? GB_calloc_memory (M->vdim + 1, sizeof(uint32_t), &allocated) : NULL ; - MTp64 = M->p_is_32 ? NULL : GB_calloc_memory (M->vdim + 1, sizeof(uint64_t), &allocated) ; - if (MTp32 == NULL && MTp64 == NULL) - { - OUT_OF_MEM_p: - GB_FREE_WORKSPACE ; - return GrB_OUT_OF_MEMORY ; - } - - GrB_Type MTtype = op->ztype ; - const size_t MTsize = MTtype->size ; - GB_void MTscalar [GB_VLA(MTsize)] ; - bool MTiso = GB_emult_iso (MTscalar, MTtype, A, B, op) ; - - GB_Mp_DECLARE(Mp, ) ; - GB_Mp_PTR(Mp, M) ; - - GB_Mh_DECLARE(Mh, ) ; - GB_Mh_PTR(Mh, M) ; - - GB_Mi_DECLARE(Mi, ) ; - GB_Mi_PTR(Mi, M) ; - - GB_cast_function cast_A = NULL ; - GB_cast_function cast_B = NULL ; - - cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; - cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; - - int64_t vlen = M->vlen ; - #pragma omp parallel - { - GrB_Index offset ; - - #pragma omp for reduction(+:nvecs) - for (GrB_Index k = 0 ; k < M->nvec ; k++) - { - GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; - - int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; - int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; - bool nonempty = false ; - for (GrB_Index p = pA_start ; p < pA_end ; p++) - { - if (!GBB (M->b, p)) continue ; - - int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; - GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; - - // extract elements from A and B, increment MTp - - if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) - { - GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); - GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); - - GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); - GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); - - bool code = GB_lookup_xoffset(&offset, A, arow, acol) ; - if (!code) - { - continue; - } - - code = GB_lookup_xoffset(&offset, B, brow, bcol) ; - if (!code) - { - continue; - } - - if (M->p_is_32) - { - (MTp32[j])++ ; - } - else - { - (MTp64[j])++ ; - } - nonempty = true ; - } - } - if (nonempty) nvecs++ ; - } - } - - // GB_cumsum for MT->p - - double work = M->vdim ; - int nthreads_max = GB_Context_nthreads_max ( ) ; - double chunk = GB_Context_chunk ( ) ; - int cumsum_threads = GB_nthreads (work, chunk, nthreads_max) ; - M->p_is_32 ? GB_cumsum(MTp32, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) : - GB_cumsum(MTp64, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) ; - - centries = M->p_is_32 ? MTp32[M->vdim] : MTp64[M->vdim] ; - - uint32_t* MTi32 = NULL ; uint64_t* MTi64 = NULL; - MTi32 = M->i_is_32 ? GB_malloc_memory (centries, sizeof(uint32_t), &allocated) : NULL ; - MTi64 = M->i_is_32 ? NULL : GB_malloc_memory (centries, sizeof(uint64_t), &allocated) ; - - if (centries > 0 && MTi32 == NULL && MTi64 == NULL) - { - OUT_OF_MEM_i: - if (M->p_is_32) { GB_free_memory (&MTp32, (M->vdim + 1) * sizeof(uint32_t)) ; } - else { GB_free_memory (&MTp64, (M->vdim + 1) * sizeof(uint64_t)) ; } - goto OUT_OF_MEM_p ; - } - - void* MTx = NULL ; - if (!MTiso) - { - MTx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; - } - else - { - MTx = GB_malloc_memory (1, op->ztype->size, &allocated) ; - if (MTx == NULL) goto OUT_OF_MEM_x ; - memcpy (MTx, MTscalar, MTsize) ; - } - - if (centries > 0 && MTx == NULL) - { - OUT_OF_MEM_x: - if (M->i_is_32) { GB_free_memory (&MTi32, centries * sizeof(uint32_t)) ; } - else { GB_free_memory (&MTi64, centries * sizeof (uint64_t)) ; } - goto OUT_OF_MEM_i ; - } - - #pragma omp parallel - { - GrB_Index offset ; - GB_void a_elem[op->xtype->size] ; - GB_void b_elem[op->ytype->size] ; - #pragma omp for - for (GrB_Index k = 0 ; k < M->nvec ; k++) - { - GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; - - int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; - int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; - GrB_Index pos = M->p_is_32 ? MTp32[j] : MTp64[j] ; - for (GrB_Index p = pA_start ; p < pA_end ; p++) - { - if (!GBB (M->b, p)) continue ; - - int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; - GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; - - // extract elements from A and B, - // initialize offset in MTi and MTx, - // get result of op, place it in MTx - - if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) - { - GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); - GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); - - GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); - GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); - - bool code = GB_lookup_xoffset (&offset, A, arow, acol) ; - if (!code) - { - continue; - } - if (!MTiso) - cast_A (a_elem, A->x + offset * A->type->size, A->type->size) ; - - code = GB_lookup_xoffset (&offset, B, brow, bcol) ; - if (!code) - { - continue; - } - if (!MTiso) - cast_B (b_elem, B->x + offset * B->type->size, B->type->size) ; - - if (!MTiso) - { - if (op->binop_function) - { - op->binop_function (MTx + op->ztype->size * pos, a_elem, b_elem) ; - } - else - { - GrB_Index ix, iy, jx, jy ; - ix = A_transpose ? acol : arow ; - iy = A_transpose ? arow : acol ; - jx = B_transpose ? bcol : brow ; - jy = B_transpose ? brow : bcol ; - op->idxbinop_function (MTx + op->ztype->size * pos, a_elem, ix, iy, - b_elem, jx, jy, op->theta) ; - } - } - - if (M->i_is_32) { MTi32[pos] = i ; } else { MTi64[pos] = i ; } - pos++ ; - } - } - } - } - - #undef GBI - #undef GBB - #undef GBP - #undef GBH - - // initialize other fields of MT properly - - MT = NULL ; struct GB_Matrix_opaque MT_header ; + GrB_Matrix MT = NULL ; struct GB_Matrix_opaque MT_header ; GB_CLEAR_MATRIX_HEADER (MT, &MT_header) ; - GrB_Info MTalloc = GB_new_bix (&MT, op->ztype, vlen, M->vdim, GB_ph_null, M->is_csc, - GxB_SPARSE, false, M->hyper_switch, M->vdim, centries, false, MTiso, - M->p_is_32, M->j_is_32, M->i_is_32) ; - if (MTalloc != GrB_SUCCESS) - { - if (MTiso) { GB_free_memory (&MTx, op->ztype->size) ; } - else { GB_free_memory (&MTx, centries * op->ztype->size) ; } - goto OUT_OF_MEM_x ; - } - - GB_free_memory (&MT->i, MT->i_size) ; - GB_free_memory (&MT->x, MT->x_size) ; - MT->p = M->p_is_32 ? (void*)MTp32 : (void*)MTp64 ; - MT->i = M->i_is_32 ? (void*)MTi32 : (void*)MTi64 ; - MT->x = MTx ; + bool A_is_pattern, B_is_pattern ; + GB_binop_pattern (&A_is_pattern, &B_is_pattern, false, op->opcode) ; - MT->p_size = (M->p_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * (M->vdim + 1) ; - MT->i_size = ((M->i_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * centries) ; - MT->x_size = MT->iso ? op->ztype->size : op->ztype->size * centries ; - MT->magic = GB_MAGIC ; - MT->nvals = centries ; - MT->nvec_nonempty = nvecs ; + GB_kroner (MT, C->is_csc, op, false, A, A_is_pattern, A_transpose, B, B_is_pattern, B_transpose, + M, Mask_comp, Mask_struct, Werk) ; - // transpose and convert to hyper if needed - - if (MT->is_csc != C->is_csc) - { + if (MT->is_csc != C->is_csc) { GrB_Info MTtranspose = GB_transpose_in_place (MT, true, Werk) ; if (MTtranspose != GrB_SUCCESS) { @@ -521,8 +280,8 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) GB_CLEAR_MATRIX_HEADER (T, &T_header) ; GB_OK (GB_kroner (T, T_is_csc, op, flipij, - A_transpose ? AT : A, A_is_pattern, - B_transpose ? BT : B, B_is_pattern, M, Mask_comp, Mask_struct, Werk)) ; + A_transpose ? AT : A, A_is_pattern, A_transpose, + B_transpose ? BT : B, B_is_pattern, B_transpose, M, Mask_comp, Mask_struct, Werk)) ; GB_FREE_WORKSPACE ; ASSERT_MATRIX_OK (T, "T = kron(A,B)", GB0) ; diff --git a/Source/kronecker/GB_kron.h b/Source/kronecker/GB_kron.h index d1628b0899..38d7b53057 100644 --- a/Source/kronecker/GB_kron.h +++ b/Source/kronecker/GB_kron.h @@ -35,9 +35,11 @@ GrB_Info GB_kroner // C = kron (A,B) const bool flipij, // if true, i and j are flipped: z=(x,y,j,i) const GrB_Matrix A, // input matrix bool A_is_pattern, // true if values of A are not used + bool A_transpose, const GrB_Matrix B, // input matrix bool B_is_pattern, // true if values of B are not used - const GrB_Matrix Mask, + bool B_transpose, + const GrB_Matrix M, const bool Mask_comp, const bool Mask_struct, GB_Werk Werk diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index 4fb265b887..94e88a9b4b 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -24,6 +24,67 @@ GB_phybix_free (C) ; \ } +#define GBI(Ai,p,avlen) ((Ai == NULL) ? ((p) % (avlen)) : Ai [p]) + +#define GBB(Ab,p) ((Ab == NULL) ? 1 : Ab [p]) + +#define GBP(Ap,k,avlen) ((Ap == NULL) ? ((k) * (avlen)) : Ap [k]) + +#define GBH(Ah,k) ((Ah == NULL) ? (k) : Ah [k]) + +#include "mxm/GB_mxm.h" + +static bool GB_lookup_xoffset ( + GrB_Index* p, + GrB_Matrix A, + GrB_Index row, + GrB_Index col +) +{ + GrB_Index vector = A->is_csc ? col : row ; + GrB_Index coord = A->is_csc ? row : col ; + + if (A->p == NULL) + { + GrB_Index offset = vector * A->vlen + coord ; + if (A->b == NULL || ((int8_t*)A->b)[offset]) + { + *p = A->iso ? 0 : offset ; + return true ; + } + return false ; + } + + int64_t start, end ; + bool res ; + + if (A->h == NULL) + { + start = A->p_is_32 ? ((uint32_t*)A->p)[vector] : ((uint64_t*)A->p)[vector] ; + end = A->p_is_32 ? ((uint32_t*)A->p)[vector + 1] : ((uint64_t*)A->p)[vector + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } + else + { + start = 0 ; end = A->plen - 1 ; + res = GB_binary_search(vector, A->h, A->j_is_32, &start, &end) ; + if (!res) return false ; + int64_t k = start ; + start = A->p_is_32 ? ((uint32_t*)A->p)[k] : ((uint64_t*)A->p)[k] ; + end = A->p_is_32 ? ((uint32_t*)A->p)[k+1] : ((uint64_t*)A->p)[k+1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } +} + + #include "kronecker/GB_kron.h" #include "emult/GB_emult.h" #include "slice/include/GB_search_for_vector.h" @@ -37,9 +98,11 @@ GrB_Info GB_kroner // C = kron (A,B) const bool flipij, // if true, i and j are flipped: z=(x,y,j,i) const GrB_Matrix A_in, // input matrix bool A_is_pattern, // true if values of A are not used + bool A_transpose, const GrB_Matrix B_in, // input matrix bool B_is_pattern, // true if values of B are not used - const GrB_Matrix Mask, + bool B_transpose, + const GrB_Matrix M, const bool Mask_comp, const bool Mask_struct, GB_Werk Werk @@ -67,6 +130,265 @@ GrB_Info GB_kroner // C = kron (A,B) GB_MATRIX_WAIT (A_in) ; GB_MATRIX_WAIT (B_in) ; + //-------------------------------------------------------------------------- + // can apply mask + //-------------------------------------------------------------------------- + + if (M != NULL && !Mask_comp) + { + GrB_Matrix A = A_in ; + GrB_Matrix B = B_in ; + GrB_Matrix MT = C ; + + GB_MATRIX_WAIT(M); + + int64_t bnrows = (B_transpose) ? GB_NCOLS (B) : GB_NROWS (B) ; + int64_t bncols = (B_transpose) ? GB_NROWS (B) : GB_NCOLS (B) ; + size_t allocated = 0 ; + bool MT_hypersparse = (A->h != NULL) || (B->h != NULL); + int64_t centries ; + uint64_t nvecs ; + centries = 0 ; + nvecs = 0 ; + + uint32_t* MTp32 = NULL ; uint64_t* MTp64 = NULL ; + MTp32 = M->p_is_32 ? GB_calloc_memory (M->vdim + 1, sizeof(uint32_t), &allocated) : NULL ; + MTp64 = M->p_is_32 ? NULL : GB_calloc_memory (M->vdim + 1, sizeof(uint64_t), &allocated) ; + if (MTp32 == NULL && MTp64 == NULL) + { + OUT_OF_MEM_p: + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + + GrB_Type MTtype = op->ztype ; + const size_t MTsize = MTtype->size ; + GB_void MTscalar [GB_VLA(MTsize)] ; + bool MTiso = GB_emult_iso (MTscalar, MTtype, A, B, op) ; + + GB_Mp_DECLARE(Mp, ) ; + GB_Mp_PTR(Mp, M) ; + + GB_Mh_DECLARE(Mh, ) ; + GB_Mh_PTR(Mh, M) ; + + GB_Mi_DECLARE(Mi, ) ; + GB_Mi_PTR(Mi, M) ; + + GB_cast_function cast_A = NULL ; + GB_cast_function cast_B = NULL ; + + cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; + cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; + + int64_t vlen = M->vlen ; + #pragma omp parallel + { + GrB_Index offset ; + + #pragma omp for reduction(+:nvecs) + for (GrB_Index k = 0 ; k < M->nvec ; k++) + { + GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; + + int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; + int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; + bool nonempty = false ; + for (GrB_Index p = pA_start ; p < pA_end ; p++) + { + if (!GBB (M->b, p)) continue ; + + int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; + GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; + + // extract elements from A and B, increment MTp + + if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + + bool code = GB_lookup_xoffset(&offset, A, arow, acol) ; + if (!code) + { + continue; + } + + code = GB_lookup_xoffset(&offset, B, brow, bcol) ; + if (!code) + { + continue; + } + + if (M->p_is_32) + { + (MTp32[j])++ ; + } + else + { + (MTp64[j])++ ; + } + nonempty = true ; + } + } + if (nonempty) nvecs++ ; + } + } + + // GB_cumsum for MT->p + + double work = M->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int cumsum_threads = GB_nthreads (work, chunk, nthreads_max) ; + M->p_is_32 ? GB_cumsum(MTp32, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) : + GB_cumsum(MTp64, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) ; + + centries = M->p_is_32 ? MTp32[M->vdim] : MTp64[M->vdim] ; + + uint32_t* MTi32 = NULL ; uint64_t* MTi64 = NULL; + MTi32 = M->i_is_32 ? GB_malloc_memory (centries, sizeof(uint32_t), &allocated) : NULL ; + MTi64 = M->i_is_32 ? NULL : GB_malloc_memory (centries, sizeof(uint64_t), &allocated) ; + + if (centries > 0 && MTi32 == NULL && MTi64 == NULL) + { + OUT_OF_MEM_i: + if (M->p_is_32) { GB_free_memory (&MTp32, (M->vdim + 1) * sizeof(uint32_t)) ; } + else { GB_free_memory (&MTp64, (M->vdim + 1) * sizeof(uint64_t)) ; } + goto OUT_OF_MEM_p ; + } + + void* MTx = NULL ; + if (!MTiso) + { + MTx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; + } + else + { + MTx = GB_malloc_memory (1, op->ztype->size, &allocated) ; + if (MTx == NULL) goto OUT_OF_MEM_x ; + memcpy (MTx, MTscalar, MTsize) ; + } + + if (centries > 0 && MTx == NULL) + { + OUT_OF_MEM_x: + if (M->i_is_32) { GB_free_memory (&MTi32, centries * sizeof(uint32_t)) ; } + else { GB_free_memory (&MTi64, centries * sizeof (uint64_t)) ; } + goto OUT_OF_MEM_i ; + } + + #pragma omp parallel + { + GrB_Index offset ; + GB_void a_elem[op->xtype->size] ; + GB_void b_elem[op->ytype->size] ; + + #pragma omp for + for (GrB_Index k = 0 ; k < M->nvec ; k++) + { + GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; + + int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; + int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; + GrB_Index pos = M->p_is_32 ? MTp32[j] : MTp64[j] ; + for (GrB_Index p = pA_start ; p < pA_end ; p++) + { + if (!GBB (M->b, p)) continue ; + + int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; + GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; + + // extract elements from A and B, + // initialize offset in MTi and MTx, + // get result of op, place it in MTx + + if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + + bool code = GB_lookup_xoffset (&offset, A, arow, acol) ; + if (!code) + { + continue; + } + if (!MTiso) + cast_A (a_elem, A->x + offset * A->type->size, A->type->size) ; + + code = GB_lookup_xoffset (&offset, B, brow, bcol) ; + if (!code) + { + continue; + } + if (!MTiso) + cast_B (b_elem, B->x + offset * B->type->size, B->type->size) ; + + if (!MTiso) + { + if (op->binop_function) + { + op->binop_function (MTx + op->ztype->size * pos, a_elem, b_elem) ; + } + else + { + GrB_Index ix, iy, jx, jy ; + ix = A_transpose ? acol : arow ; + iy = A_transpose ? arow : acol ; + jx = B_transpose ? bcol : brow ; + jy = B_transpose ? brow : bcol ; + op->idxbinop_function (MTx + op->ztype->size * pos, a_elem, ix, iy, + b_elem, jx, jy, op->theta) ; + } + } + + if (M->i_is_32) { MTi32[pos] = i ; } else { MTi64[pos] = i ; } + pos++ ; + } + } + } + } + + #undef GBI + #undef GBB + #undef GBP + #undef GBH + + // initialize other fields of MT properly + + GrB_Info MTalloc = GB_new_bix (&MT, op->ztype, vlen, M->vdim, GB_ph_null, M->is_csc, + GxB_SPARSE, false, M->hyper_switch, M->vdim, centries, false, MTiso, + M->p_is_32, M->j_is_32, M->i_is_32) ; + if (MTalloc != GrB_SUCCESS) + { + if (MTiso) { GB_free_memory (&MTx, op->ztype->size) ; } + else { GB_free_memory (&MTx, centries * op->ztype->size) ; } + goto OUT_OF_MEM_x ; + } + + GB_free_memory (&MT->i, MT->i_size) ; + GB_free_memory (&MT->x, MT->x_size) ; + + MT->p = M->p_is_32 ? (void*)MTp32 : (void*)MTp64 ; + MT->i = M->i_is_32 ? (void*)MTi32 : (void*)MTi64 ; + MT->x = MTx ; + + MT->p_size = (M->p_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * (M->vdim + 1) ; + MT->i_size = ((M->i_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * centries) ; + MT->x_size = MT->iso ? op->ztype->size : op->ztype->size * centries ; + MT->magic = GB_MAGIC ; + MT->nvals = centries ; + MT->nvec_nonempty = nvecs ; + + return GrB_SUCCESS ; + } + //-------------------------------------------------------------------------- // bitmap case: create sparse copies of A and B if they are bitmap //-------------------------------------------------------------------------- From fed196a7d727fbcee86a827fef7412f82fe2fb6b Mon Sep 17 00:00:00 2001 From: Ilya Maltsev Date: Mon, 30 Mar 2026 18:26:32 +0300 Subject: [PATCH 3/7] fix: assigning proper number of threads now --- Source/kronecker/GB_kron.c | 13 +++++++++++-- Source/kronecker/GB_kroner.c | 23 ++++++++++++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/Source/kronecker/GB_kron.c b/Source/kronecker/GB_kron.c index e5894af475..4fcfc9bf29 100644 --- a/Source/kronecker/GB_kron.c +++ b/Source/kronecker/GB_kron.c @@ -180,8 +180,12 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) bool A_is_pattern, B_is_pattern ; GB_binop_pattern (&A_is_pattern, &B_is_pattern, false, op->opcode) ; - GB_kroner (MT, C->is_csc, op, false, A, A_is_pattern, A_transpose, B, B_is_pattern, B_transpose, + GrB_Info masked_kroner_info = GB_kroner (MT, C->is_csc, op, false, A, A_is_pattern, A_transpose, B, B_is_pattern, B_transpose, M, Mask_comp, Mask_struct, Werk) ; + if (masked_kroner_info != GrB_SUCCESS) + { + return masked_kroner_info ; + } if (MT->is_csc != C->is_csc) { GrB_Info MTtranspose = GB_transpose_in_place (MT, true, Werk) ; @@ -212,7 +216,12 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) return GrB_OUT_OF_MEMORY ; } - #pragma omp parallel for + double work = M->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int masked_hyper_threads = GB_nthreads (work, chunk, nthreads_max) ; + + #pragma omp parallel for num_threads(masked_hyper_threads) schedule(static) for (GrB_Index i = 0; i < MT->vdim; i++) { if (MT->j_is_32) { MTh32[i] = i ; } else { MTh64[i] = i ; } diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index 94e88a9b4b..39b4ab55ed 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -134,7 +134,7 @@ GrB_Info GB_kroner // C = kron (A,B) // can apply mask //-------------------------------------------------------------------------- - if (M != NULL && !Mask_comp) + if (M != NULL && !Mask_comp) { GrB_Matrix A = A_in ; GrB_Matrix B = B_in ; @@ -181,12 +181,17 @@ GrB_Info GB_kroner // C = kron (A,B) cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; + double work = M->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int masked_nthreads = GB_nthreads (work, chunk, nthreads_max) ; + int64_t vlen = M->vlen ; - #pragma omp parallel + #pragma omp parallel num_threads(masked_nthreads) { GrB_Index offset ; - #pragma omp for reduction(+:nvecs) + #pragma omp for reduction(+:nvecs) schedule(static) for (GrB_Index k = 0 ; k < M->nvec ; k++) { GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; @@ -205,11 +210,11 @@ GrB_Info GB_kroner // C = kron (A,B) if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) { - GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); - GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows) ; + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols) ; - GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); - GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows) ; + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols) ; bool code = GB_lookup_xoffset(&offset, A, arow, acol) ; if (!code) @@ -281,13 +286,13 @@ GrB_Info GB_kroner // C = kron (A,B) goto OUT_OF_MEM_i ; } - #pragma omp parallel + #pragma omp parallel num_threads(masked_nthreads) { GrB_Index offset ; GB_void a_elem[op->xtype->size] ; GB_void b_elem[op->ytype->size] ; - #pragma omp for + #pragma omp for schedule(static) for (GrB_Index k = 0 ; k < M->nvec ; k++) { GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; From ae5fece920a5a73ab20fa184b3f393f5ff1da077 Mon Sep 17 00:00:00 2001 From: Ilya Maltsev Date: Mon, 30 Mar 2026 18:33:22 +0300 Subject: [PATCH 4/7] fix: fixed redefinition --- Source/kronecker/GB_kroner.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index 39b4ab55ed..2644540897 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -245,12 +245,8 @@ GrB_Info GB_kroner // C = kron (A,B) // GB_cumsum for MT->p - double work = M->vdim ; - int nthreads_max = GB_Context_nthreads_max ( ) ; - double chunk = GB_Context_chunk ( ) ; - int cumsum_threads = GB_nthreads (work, chunk, nthreads_max) ; - M->p_is_32 ? GB_cumsum(MTp32, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) : - GB_cumsum(MTp64, M->p_is_32, M->vdim, NULL, cumsum_threads, Werk) ; + M->p_is_32 ? GB_cumsum(MTp32, M->p_is_32, M->vdim, NULL, masked_nthreads, Werk) : + GB_cumsum(MTp64, M->p_is_32, M->vdim, NULL, masked_nthreads, Werk) ; centries = M->p_is_32 ? MTp32[M->vdim] : MTp64[M->vdim] ; From 9c0e6aff65b7684982e5fa3e9ed7b7b7e80faf92 Mon Sep 17 00:00:00 2001 From: Ilya Maltsev Date: Wed, 22 Apr 2026 02:39:38 +0300 Subject: [PATCH 5/7] feat: moved implementation to template feat: moved implementation to template feat: moved implementation to template --- .../jit_kernels/include/GB_jit_kernel_proto.h | 5 + Source/jit_wrappers/GB_kroner_jit.c | 9 +- Source/jitifyer/GB_stringify.h | 5 + Source/kronecker/GB_kron.c | 22 +- Source/kronecker/GB_kron.h | 4 +- Source/kronecker/GB_kroner.c | 719 +++++++++--------- .../kronecker/template/GB_kroner_template.c | 393 +++++++--- 7 files changed, 684 insertions(+), 473 deletions(-) diff --git a/Source/jit_kernels/include/GB_jit_kernel_proto.h b/Source/jit_kernels/include/GB_jit_kernel_proto.h index 003e9a578f..37dbe10f33 100644 --- a/Source/jit_kernels/include/GB_jit_kernel_proto.h +++ b/Source/jit_kernels/include/GB_jit_kernel_proto.h @@ -727,7 +727,12 @@ GrB_Info GB_jit_kernel_kroner \ ( \ GrB_Matrix C, \ const GrB_Matrix A, \ + const bool A_transpose, \ const GrB_Matrix B, \ + const bool B_transpose, \ + const GrB_Matrix Mask, \ + const bool Mask_struct, \ + const bool Mask_comp, \ const int nthreads, \ const void *theta, \ const GB_callback_struct *restrict my_callback \ diff --git a/Source/jit_wrappers/GB_kroner_jit.c b/Source/jit_wrappers/GB_kroner_jit.c index 174ec0e37e..1919b47f7d 100644 --- a/Source/jit_wrappers/GB_kroner_jit.c +++ b/Source/jit_wrappers/GB_kroner_jit.c @@ -20,7 +20,12 @@ GrB_Info GB_kroner_jit const GrB_BinaryOp binaryop, const bool flipij, const GrB_Matrix A, + const bool A_transpose, const GrB_Matrix B, + const bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_struct, + const bool Mask_comp, const int nthreads ) { @@ -41,7 +46,7 @@ GrB_Info GB_kroner_jit GB_JIT_KERNEL_KRONER, /* is_ewisemult: */ false, /* C_iso: */ C->iso, /* C_in_iso: */ false, C_sparsity, C->type, C->p_is_32, C->j_is_32, C->i_is_32, - /* M: */ NULL, true, false, binaryop, flipij, false, A, B) ; + /* M: */ Mask, Mask_struct, Mask_comp, binaryop, flipij, false, A, B) ; //-------------------------------------------------------------------------- // get the kernel function pointer, loading or compiling it if needed @@ -60,6 +65,6 @@ GrB_Info GB_kroner_jit #include "include/GB_pedantic_disable.h" GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; - return (GB_jit_kernel (C, A, B, nthreads, binaryop->theta, &GB_callback)) ; + return (GB_jit_kernel (C, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads, binaryop->theta, &GB_callback)) ; } diff --git a/Source/jitifyer/GB_stringify.h b/Source/jitifyer/GB_stringify.h index df349a87ca..e2b0a0b46a 100644 --- a/Source/jitifyer/GB_stringify.h +++ b/Source/jitifyer/GB_stringify.h @@ -1855,7 +1855,12 @@ GrB_Info GB_kroner_jit const GrB_BinaryOp binaryop, const bool flipij, const GrB_Matrix A, + const bool A_transpose, const GrB_Matrix B, + const bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_struct, + const bool Mask_comp, const int nthreads ) ; diff --git a/Source/kronecker/GB_kron.c b/Source/kronecker/GB_kron.c index 4fcfc9bf29..7e414d7fbe 100644 --- a/Source/kronecker/GB_kron.c +++ b/Source/kronecker/GB_kron.c @@ -37,7 +37,7 @@ #include "mask/GB_accum_mask.h" static bool GB_lookup_xoffset ( - GrB_Index* p, + GrB_Index *p, GrB_Matrix A, GrB_Index row, GrB_Index col @@ -49,7 +49,7 @@ static bool GB_lookup_xoffset ( if (A->p == NULL) { GrB_Index offset = vector * A->vlen + coord ; - if (A->b == NULL || ((int8_t*)A->b)[offset]) + if (A->b == NULL || ((int8_t *)A->b)[offset]) { *p = A->iso ? 0 : offset ; return true ; @@ -62,8 +62,8 @@ static bool GB_lookup_xoffset ( if (A->h == NULL) { - start = A->p_is_32 ? ((uint32_t*)A->p)[vector] : ((uint64_t*)A->p)[vector] ; - end = A->p_is_32 ? ((uint32_t*)A->p)[vector + 1] : ((uint64_t*)A->p)[vector + 1] ; + start = A->p_is_32 ? ((uint32_t *)A->p)[vector] : ((uint64_t *)A->p)[vector] ; + end = A->p_is_32 ? ((uint32_t *)A->p)[vector + 1] : ((uint64_t *)A->p)[vector + 1] ; end-- ; if (start > end) return false ; res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; @@ -76,8 +76,8 @@ static bool GB_lookup_xoffset ( res = GB_binary_search(vector, A->h, A->j_is_32, &start, &end) ; if (!res) return false ; int64_t k = start ; - start = A->p_is_32 ? ((uint32_t*)A->p)[k] : ((uint64_t*)A->p)[k] ; - end = A->p_is_32 ? ((uint32_t*)A->p)[k+1] : ((uint64_t*)A->p)[k+1] ; + start = A->p_is_32 ? ((uint32_t *)A->p)[k] : ((uint64_t *)A->p)[k] ; + end = A->p_is_32 ? ((uint32_t *)A->p)[k+1] : ((uint64_t *)A->p)[k+1] ; end-- ; if (start > end) return false ; res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; @@ -92,7 +92,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) ( GrB_Matrix C, // input/output matrix for results const bool C_replace, // if true, clear C before writing to it - const GrB_Matrix M, // optional mask for C, unused if NULL + const GrB_Matrix Mask, // optional mask for C, unused if NULL const bool Mask_comp, // if true, use !M const bool Mask_struct, // if true, use the only structure of M const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) @@ -111,6 +111,8 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) // C may be aliased with M, A, and/or B + GrB_Matrix M = Mask ; + GrB_Info info ; struct GB_Matrix_opaque T_header, AT_header, BT_header ; GrB_Matrix T = NULL, AT = NULL, BT = NULL ; @@ -165,9 +167,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) GB_RETURN_IF_QUICK_MASK (C, C_replace, M, Mask_comp, Mask_struct) ; // check if it's possible to apply mask immediately in kron - // TODO: make MT of same CSR/CSC format as C // TODO: MT should have its own 32/64 bitness controls - // TODO: clear MT header bool Mask_is_applicable = M != NULL && !Mask_comp ; if (Mask_is_applicable) { @@ -199,7 +199,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) if (MT_hypersparse) { - uint32_t* MTh32 = NULL ; uint64_t* MTh64 = NULL ; + uint32_t *MTh32 = NULL ; uint64_t *MTh64 = NULL ; if (MT->j_is_32) { MTh32 = GB_malloc_memory (MT->vdim, sizeof(uint32_t), &allocated) ; @@ -227,7 +227,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) if (MT->j_is_32) { MTh32[i] = i ; } else { MTh64[i] = i ; } } - MT->h = MTh32 ? (void*)MTh32 : (void*)MTh64 ; + MT->h = MTh32 ? (void *)MTh32 : (void *)MTh64 ; GrB_Info MThyperprune = GB_hyper_prune (MT, Werk) ; if (MThyperprune != GrB_SUCCESS) diff --git a/Source/kronecker/GB_kron.h b/Source/kronecker/GB_kron.h index 38d7b53057..a25b0cfa15 100644 --- a/Source/kronecker/GB_kron.h +++ b/Source/kronecker/GB_kron.h @@ -15,7 +15,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) ( GrB_Matrix C, // input/output matrix for results const bool C_replace, // if true, clear C before writing to it - const GrB_Matrix M, // optional mask for C, unused if NULL + const GrB_Matrix Mask, // optional mask for C, unused if NULL const bool Mask_comp, // if true, use !M const bool Mask_struct, // if true, use the only structure of M const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) @@ -39,7 +39,7 @@ GrB_Info GB_kroner // C = kron (A,B) const GrB_Matrix B, // input matrix bool B_is_pattern, // true if values of B are not used bool B_transpose, - const GrB_Matrix M, + const GrB_Matrix Mask, const bool Mask_comp, const bool Mask_struct, GB_Werk Werk diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index 2644540897..fabfd2ceb2 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -25,17 +25,23 @@ } #define GBI(Ai,p,avlen) ((Ai == NULL) ? ((p) % (avlen)) : Ai [p]) - #define GBB(Ab,p) ((Ab == NULL) ? 1 : Ab [p]) - #define GBP(Ap,k,avlen) ((Ap == NULL) ? ((k) * (avlen)) : Ap [k]) - #define GBH(Ah,k) ((Ah == NULL) ? (k) : Ah [k]) #include "mxm/GB_mxm.h" +#include "kronecker/GB_kron.h" +#include "emult/GB_emult.h" +#include "slice/include/GB_search_for_vector.h" +#include "jitifyer/GB_stringify.h" + +//------------------------------------------------------------------------------ +// GB_lookup_xoffset: find the offset of (row,col) in a Ax if present +//------------------------------------------------------------------------------ -static bool GB_lookup_xoffset ( - GrB_Index* p, +static bool GB_lookup_xoffset +( + GrB_Index *p, GrB_Matrix A, GrB_Index row, GrB_Index col @@ -47,7 +53,7 @@ static bool GB_lookup_xoffset ( if (A->p == NULL) { GrB_Index offset = vector * A->vlen + coord ; - if (A->b == NULL || ((int8_t*)A->b)[offset]) + if (A->b == NULL || ((int8_t *) A->b) [offset]) { *p = A->iso ? 0 : offset ; return true ; @@ -60,37 +66,145 @@ static bool GB_lookup_xoffset ( if (A->h == NULL) { - start = A->p_is_32 ? ((uint32_t*)A->p)[vector] : ((uint64_t*)A->p)[vector] ; - end = A->p_is_32 ? ((uint32_t*)A->p)[vector + 1] : ((uint64_t*)A->p)[vector + 1] ; + start = A->p_is_32 ? ((uint32_t *) A->p) [vector] + : ((uint64_t *) A->p) [vector] ; + end = A->p_is_32 ? ((uint32_t *) A->p) [vector + 1] + : ((uint64_t *) A->p) [vector + 1] ; end-- ; if (start > end) return false ; - res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + res = GB_binary_search (coord, A->i, A->i_is_32, &start, &end) ; if (res) { *p = A->iso ? 0 : start ; } return res ; } else { start = 0 ; end = A->plen - 1 ; - res = GB_binary_search(vector, A->h, A->j_is_32, &start, &end) ; + res = GB_binary_search (vector, A->h, A->j_is_32, &start, &end) ; if (!res) return false ; int64_t k = start ; - start = A->p_is_32 ? ((uint32_t*)A->p)[k] : ((uint64_t*)A->p)[k] ; - end = A->p_is_32 ? ((uint32_t*)A->p)[k+1] : ((uint64_t*)A->p)[k+1] ; + start = A->p_is_32 ? ((uint32_t *) A->p) [k] + : ((uint64_t *) A->p) [k] ; + end = A->p_is_32 ? ((uint32_t *) A->p) [k + 1] + : ((uint64_t *) A->p) [k + 1] ; end-- ; if (start > end) return false ; - res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + res = GB_binary_search (coord, A->i, A->i_is_32, &start, &end) ; if (res) { *p = A->iso ? 0 : start ; } return res ; } } +//------------------------------------------------------------------------------ +// GB_kroner_generic: generic kernel for masked and default paths +//------------------------------------------------------------------------------ -#include "kronecker/GB_kron.h" -#include "emult/GB_emult.h" -#include "slice/include/GB_search_for_vector.h" -#include "jitifyer/GB_stringify.h" +static GrB_Info GB_kroner_generic +( + GrB_Matrix C, + const GrB_BinaryOp op, + const bool flipij, + const GrB_Matrix A, + const GrB_Matrix B, + bool A_is_pattern, + bool B_is_pattern, + bool A_transpose, + bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, + const bool C_is_full, + const int nthreads +) +{ + GrB_Type ctype = op->ztype ; + const size_t csize = ctype->size ; -GrB_Info GB_kroner // C = kron (A,B) + GB_Ap_DECLARE (Ap, const) ; GB_Ap_PTR (Ap, A) ; + GB_Ah_DECLARE (Ah, const) ; GB_Ah_PTR (Ah, A) ; + const int64_t avlen = A->vlen ; + + GB_Bp_DECLARE (Bp, const) ; GB_Bp_PTR (Bp, B) ; + GB_Bh_DECLARE (Bh, const) ; GB_Bh_PTR (Bh, B) ; + const int64_t bvlen = B->vlen ; + const int64_t bnvec = B->nvec ; + + GB_Cp_DECLARE (Cp, ) ; GB_Cp_PTR (Cp, C) ; + const int64_t cnvec = C->nvec ; + const int64_t cvlen = C->vlen ; + const int64_t cnz = C->nvals ; + + #define GB_NO_MASK (Mask == NULL) + #define GB_MASK_STRUCT Mask_struct + #define GB_MASK_COMP Mask_comp + #define GB_Cp_IS_32 C->p_is_32 + #define GB_A_TYPE GB_void + #define GB_B_TYPE GB_void + #define GB_C_TYPE GB_void + #define GB_A_ISO A_iso + #define GB_B_ISO B_iso + #define GB_C_ISO C_iso + #define GB_C_IS_FULL C_is_full + + const bool A_iso = A->iso ; + const bool B_iso = B->iso ; + const bool C_iso = C->iso ; + const int64_t asize = A->type->size ; + const int64_t bsize = B->type->size ; + + GxB_binary_function fmult = op->binop_function ; + GxB_index_binary_function fmult_idx = op->idxbinop_function ; + const void *theta = op->theta ; + + GB_cast_function cast_A = NULL, cast_B = NULL ; + if (!A_is_pattern) + cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; + if (!B_is_pattern) + cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; + + #define GB_DECLAREA(a) GB_void a [GB_VLA(asize)] + #define GB_DECLAREB(b) GB_void b [GB_VLA(bsize)] + + #define GB_GETA(a,Ax,p,iso) \ + { \ + if (!A_is_pattern) \ + cast_A (a, Ax + (p)*asize, asize) ; \ + } + + #define GB_GETB(b,Bx,p,iso) \ + { \ + if (!B_is_pattern) \ + cast_B (b, Bx + (p)*bsize, bsize) ; \ + } + + #define GB_KRONECKER_OP(Cx,pC,a,ix,jx,b,iy,jy) \ + { \ + if (fmult != NULL) \ + { \ + fmult (Cx + (pC)*csize, a, b) ; \ + } \ + else \ + { \ + if (flipij) \ + fmult_idx (Cx + (pC)*csize, \ + a, jx, ix, b, jy, iy, theta) ; \ + else \ + fmult_idx (Cx + (pC)*csize, \ + a, ix, jx, b, iy, jy, theta) ; \ + } \ + } + + #define GB_GENERIC + #include "ewise/include/GB_ewise_shared_definitions.h" + #include "kronecker/template/GB_kroner_template.c" + + return GrB_SUCCESS ; +} + +//------------------------------------------------------------------------------ +// GB_kroner: Kronecker product, C = kron (A,B) +//------------------------------------------------------------------------------ + +GrB_Info GB_kroner ( GrB_Matrix C, // output matrix const bool C_is_csc, // desired format of C @@ -102,7 +216,7 @@ GrB_Info GB_kroner // C = kron (A,B) const GrB_Matrix B_in, // input matrix bool B_is_pattern, // true if values of B are not used bool B_transpose, - const GrB_Matrix M, + const GrB_Matrix Mask, const bool Mask_comp, const bool Mask_struct, GB_Werk Werk @@ -131,111 +245,112 @@ GrB_Info GB_kroner // C = kron (A,B) GB_MATRIX_WAIT (B_in) ; //-------------------------------------------------------------------------- - // can apply mask + // common definitions for masked and default //-------------------------------------------------------------------------- - if (M != NULL && !Mask_comp) - { - GrB_Matrix A = A_in ; - GrB_Matrix B = B_in ; - GrB_Matrix MT = C ; + GrB_Type ctype = op->ztype ; + const size_t csize = ctype->size ; + GB_void cscalar [GB_VLA(csize)] ; + bool C_iso = GB_emult_iso (cscalar, ctype, A_in, B_in, op) ; - GB_MATRIX_WAIT(M); + GrB_Matrix A = A_in ; + GrB_Matrix B = B_in ; + + //-------------------------------------------------------------------------- + // apply mask immediately if possible + //-------------------------------------------------------------------------- - int64_t bnrows = (B_transpose) ? GB_NCOLS (B) : GB_NROWS (B) ; - int64_t bncols = (B_transpose) ? GB_NROWS (B) : GB_NCOLS (B) ; + if (Mask != NULL && !Mask_comp) + { + GB_MATRIX_WAIT (Mask) ; + + int64_t bnrows = B_transpose ? GB_NCOLS (B) : GB_NROWS (B) ; + int64_t bncols = B_transpose ? GB_NROWS (B) : GB_NCOLS (B) ; size_t allocated = 0 ; - bool MT_hypersparse = (A->h != NULL) || (B->h != NULL); - int64_t centries ; - uint64_t nvecs ; - centries = 0 ; - nvecs = 0 ; - - uint32_t* MTp32 = NULL ; uint64_t* MTp64 = NULL ; - MTp32 = M->p_is_32 ? GB_calloc_memory (M->vdim + 1, sizeof(uint32_t), &allocated) : NULL ; - MTp64 = M->p_is_32 ? NULL : GB_calloc_memory (M->vdim + 1, sizeof(uint64_t), &allocated) ; - if (MTp32 == NULL && MTp64 == NULL) + int64_t centries = 0 ; + uint64_t nvecs = 0 ; + + //---------------------------------------------------------------------- + // allocate Cp + //---------------------------------------------------------------------- + + uint32_t *Cp32 = NULL ; uint64_t *Cp64 = NULL ; + if (Mask->p_is_32) + Cp32 = GB_calloc_memory (Mask->vdim + 1, sizeof (uint32_t), + &allocated) ; + else + Cp64 = GB_calloc_memory (Mask->vdim + 1, sizeof (uint64_t), + &allocated) ; + + if (Cp32 == NULL && Cp64 == NULL) { - OUT_OF_MEM_p: GB_FREE_WORKSPACE ; return GrB_OUT_OF_MEMORY ; } - GrB_Type MTtype = op->ztype ; - const size_t MTsize = MTtype->size ; - GB_void MTscalar [GB_VLA(MTsize)] ; - bool MTiso = GB_emult_iso (MTscalar, MTtype, A, B, op) ; - - GB_Mp_DECLARE(Mp, ) ; - GB_Mp_PTR(Mp, M) ; + GB_Mp_DECLARE (Mp, ) ; GB_Mp_PTR (Mp, Mask) ; + GB_Mh_DECLARE (Mh, ) ; GB_Mh_PTR (Mh, Mask) ; + GB_Mi_DECLARE (Mi, ) ; GB_Mi_PTR (Mi, Mask) ; - GB_Mh_DECLARE(Mh, ) ; - GB_Mh_PTR(Mh, M) ; + GB_cast_function cast_A = GB_cast_factory (op->xtype->code, + A->type->code) ; + GB_cast_function cast_B = GB_cast_factory (op->ytype->code, + B->type->code) ; - GB_Mi_DECLARE(Mi, ) ; - GB_Mi_PTR(Mi, M) ; - - GB_cast_function cast_A = NULL ; - GB_cast_function cast_B = NULL ; - - cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; - cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; - - double work = M->vdim ; + double work = Mask->vdim ; int nthreads_max = GB_Context_nthreads_max ( ) ; double chunk = GB_Context_chunk ( ) ; - int masked_nthreads = GB_nthreads (work, chunk, nthreads_max) ; + int nthreads = GB_nthreads (work, chunk, nthreads_max) ; + + int64_t vlen = Mask->vlen ; + + //---------------------------------------------------------------------- + // count entries per vector + //---------------------------------------------------------------------- - int64_t vlen = M->vlen ; - #pragma omp parallel num_threads(masked_nthreads) + #pragma omp parallel num_threads(nthreads) { GrB_Index offset ; #pragma omp for reduction(+:nvecs) schedule(static) - for (GrB_Index k = 0 ; k < M->nvec ; k++) + for (GrB_Index k = 0 ; k < Mask->nvec ; k++) { GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; - int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; - int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; + int64_t pM_start = Mp32 ? GBP (Mp32, k, vlen) + : GBP (Mp64, k, vlen) ; + int64_t pM_end = Mp32 ? GBP (Mp32, k+1, vlen) + : GBP (Mp64, k+1, vlen) ; bool nonempty = false ; - for (GrB_Index p = pA_start ; p < pA_end ; p++) - { - if (!GBB (M->b, p)) continue ; - int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; - GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; + for (GrB_Index p = pM_start ; p < pM_end ; p++) + { + if (!GBB (Mask->b, p)) continue ; - // extract elements from A and B, increment MTp + int64_t i = Mi32 ? GBI (Mi32, p, vlen) + : GBI (Mi64, p, vlen) ; + GrB_Index Mrow = Mask->is_csc ? i : j ; + GrB_Index Mcol = Mask->is_csc ? j : i ; - if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) + if (Mask_struct || (Mask->iso ? ((bool *) Mask->x) [0] + : ((bool *) Mask->x) [p])) { - GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows) ; - GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols) ; - - GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows) ; - GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols) ; - - bool code = GB_lookup_xoffset(&offset, A, arow, acol) ; - if (!code) - { - continue; - } - - code = GB_lookup_xoffset(&offset, B, brow, bcol) ; - if (!code) - { - continue; - } - - if (M->p_is_32) - { - (MTp32[j])++ ; - } - else - { - (MTp64[j])++ ; - } + GrB_Index arow = A_transpose ? (Mcol / bncols) + : (Mrow / bnrows) ; + GrB_Index acol = A_transpose ? (Mrow / bnrows) + : (Mcol / bncols) ; + GrB_Index brow = B_transpose ? (Mcol % bncols) + : (Mrow % bnrows) ; + GrB_Index bcol = B_transpose ? (Mrow % bnrows) + : (Mcol % bncols) ; + + if (!GB_lookup_xoffset (&offset, A, arow, acol)) + continue ; + if (!GB_lookup_xoffset (&offset, B, brow, bcol)) + continue ; + + if (Mask->p_is_32) { (Cp32 [j])++ ; } + else { (Cp64 [j])++ ; } nonempty = true ; } } @@ -243,160 +358,147 @@ GrB_Info GB_kroner // C = kron (A,B) } } - // GB_cumsum for MT->p - - M->p_is_32 ? GB_cumsum(MTp32, M->p_is_32, M->vdim, NULL, masked_nthreads, Werk) : - GB_cumsum(MTp64, M->p_is_32, M->vdim, NULL, masked_nthreads, Werk) ; + //---------------------------------------------------------------------- + // prefix sum to get centries + //---------------------------------------------------------------------- - centries = M->p_is_32 ? MTp32[M->vdim] : MTp64[M->vdim] ; + if (Mask->p_is_32) + GB_cumsum (Cp32, Mask->p_is_32, Mask->vdim, NULL, nthreads, Werk) ; + else + GB_cumsum (Cp64, Mask->p_is_32, Mask->vdim, NULL, nthreads, Werk) ; - uint32_t* MTi32 = NULL ; uint64_t* MTi64 = NULL; - MTi32 = M->i_is_32 ? GB_malloc_memory (centries, sizeof(uint32_t), &allocated) : NULL ; - MTi64 = M->i_is_32 ? NULL : GB_malloc_memory (centries, sizeof(uint64_t), &allocated) ; + centries = Mask->p_is_32 ? (int64_t) Cp32 [Mask->vdim] + : (int64_t) Cp64 [Mask->vdim] ; - if (centries > 0 && MTi32 == NULL && MTi64 == NULL) - { - OUT_OF_MEM_i: - if (M->p_is_32) { GB_free_memory (&MTp32, (M->vdim + 1) * sizeof(uint32_t)) ; } - else { GB_free_memory (&MTp64, (M->vdim + 1) * sizeof(uint64_t)) ; } - goto OUT_OF_MEM_p ; - } + //---------------------------------------------------------------------- + // allocate Ci + //---------------------------------------------------------------------- - void* MTx = NULL ; - if (!MTiso) - { - MTx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; - } + uint32_t *Ci32 = NULL ; uint64_t *Ci64 = NULL ; + if (Mask->i_is_32) + Ci32 = GB_malloc_memory (centries, sizeof (uint32_t), &allocated) ; else + Ci64 = GB_malloc_memory (centries, sizeof (uint64_t), &allocated) ; + + if (centries > 0 && Ci32 == NULL && Ci64 == NULL) { - MTx = GB_malloc_memory (1, op->ztype->size, &allocated) ; - if (MTx == NULL) goto OUT_OF_MEM_x ; - memcpy (MTx, MTscalar, MTsize) ; + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; } - if (centries > 0 && MTx == NULL) + //---------------------------------------------------------------------- + // allocate Cx + //---------------------------------------------------------------------- + + void *Cx = NULL ; + if (C_iso) { - OUT_OF_MEM_x: - if (M->i_is_32) { GB_free_memory (&MTi32, centries * sizeof(uint32_t)) ; } - else { GB_free_memory (&MTi64, centries * sizeof (uint64_t)) ; } - goto OUT_OF_MEM_i ; + Cx = GB_malloc_memory (1, op->ztype->size, &allocated) ; + if (Cx == NULL) + { + if (Mask->i_is_32) GB_free_memory (&Ci32, + centries * sizeof (uint32_t)) ; + else GB_free_memory (&Ci64, + centries * sizeof (uint64_t)) ; + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + memcpy (Cx, cscalar, csize) ; } - - #pragma omp parallel num_threads(masked_nthreads) + else { - GrB_Index offset ; - GB_void a_elem[op->xtype->size] ; - GB_void b_elem[op->ytype->size] ; - - #pragma omp for schedule(static) - for (GrB_Index k = 0 ; k < M->nvec ; k++) + Cx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; + if (centries > 0 && Cx == NULL) { - GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; - - int64_t pA_start = Mp32 ? GBP (Mp32, k, vlen) : GBP(Mp64, k, vlen) ; - int64_t pA_end = Mp32 ? GBP (Mp32, k+1, vlen) : GBP(Mp64, k+1, vlen) ; - GrB_Index pos = M->p_is_32 ? MTp32[j] : MTp64[j] ; - for (GrB_Index p = pA_start ; p < pA_end ; p++) - { - if (!GBB (M->b, p)) continue ; - - int64_t i = Mi32 ? GBI (Mi32, p, vlen) : GBI (Mi64, p, vlen) ; - GrB_Index Mrow = M->is_csc ? i : j ; GrB_Index Mcol = M->is_csc ? j : i ; - - // extract elements from A and B, - // initialize offset in MTi and MTx, - // get result of op, place it in MTx - - if (Mask_struct || (M->iso ? ((int8_t*)M->x)[0] : ((int8_t*)M->x)[p])) - { - GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); - GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); - - GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); - GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); - - bool code = GB_lookup_xoffset (&offset, A, arow, acol) ; - if (!code) - { - continue; - } - if (!MTiso) - cast_A (a_elem, A->x + offset * A->type->size, A->type->size) ; - - code = GB_lookup_xoffset (&offset, B, brow, bcol) ; - if (!code) - { - continue; - } - if (!MTiso) - cast_B (b_elem, B->x + offset * B->type->size, B->type->size) ; - - if (!MTiso) - { - if (op->binop_function) - { - op->binop_function (MTx + op->ztype->size * pos, a_elem, b_elem) ; - } - else - { - GrB_Index ix, iy, jx, jy ; - ix = A_transpose ? acol : arow ; - iy = A_transpose ? arow : acol ; - jx = B_transpose ? bcol : brow ; - jy = B_transpose ? brow : bcol ; - op->idxbinop_function (MTx + op->ztype->size * pos, a_elem, ix, iy, - b_elem, jx, jy, op->theta) ; - } - } - - if (M->i_is_32) { MTi32[pos] = i ; } else { MTi64[pos] = i ; } - pos++ ; - } - } + if (Mask->i_is_32) GB_free_memory (&Ci32, + centries * sizeof (uint32_t)) ; + else GB_free_memory (&Ci64, + centries * sizeof (uint64_t)) ; + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; } } - #undef GBI - #undef GBB - #undef GBP - #undef GBH + //---------------------------------------------------------------------- + // allocate and initialize C matrix header + //---------------------------------------------------------------------- - // initialize other fields of MT properly - - GrB_Info MTalloc = GB_new_bix (&MT, op->ztype, vlen, M->vdim, GB_ph_null, M->is_csc, - GxB_SPARSE, false, M->hyper_switch, M->vdim, centries, false, MTiso, - M->p_is_32, M->j_is_32, M->i_is_32) ; - if (MTalloc != GrB_SUCCESS) + GrB_Info Calloc = GB_new_bix (&C, op->ztype, vlen, Mask->vdim, + GB_ph_null, Mask->is_csc, GxB_SPARSE, false, Mask->hyper_switch, + Mask->vdim, centries, false, C_iso, + Mask->p_is_32, Mask->j_is_32, Mask->i_is_32) ; + if (Calloc != GrB_SUCCESS) { - if (MTiso) { GB_free_memory (&MTx, op->ztype->size) ; } - else { GB_free_memory (&MTx, centries * op->ztype->size) ; } - goto OUT_OF_MEM_x ; + if (C_iso) GB_free_memory (&Cx, op->ztype->size) ; + else GB_free_memory (&Cx, centries * op->ztype->size) ; + if (Mask->i_is_32) GB_free_memory (&Ci32, + centries * sizeof (uint32_t)) ; + else GB_free_memory (&Ci64, + centries * sizeof (uint64_t)) ; + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return Calloc ; } - GB_free_memory (&MT->i, MT->i_size) ; - GB_free_memory (&MT->x, MT->x_size) ; - - MT->p = M->p_is_32 ? (void*)MTp32 : (void*)MTp64 ; - MT->i = M->i_is_32 ? (void*)MTi32 : (void*)MTi64 ; - MT->x = MTx ; - - MT->p_size = (M->p_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * (M->vdim + 1) ; - MT->i_size = ((M->i_is_32 ? sizeof(uint32_t) : sizeof(uint64_t)) * centries) ; - MT->x_size = MT->iso ? op->ztype->size : op->ztype->size * centries ; - MT->magic = GB_MAGIC ; - MT->nvals = centries ; - MT->nvec_nonempty = nvecs ; + GB_free_memory (&C->i, C->i_size) ; + GB_free_memory (&C->x, C->x_size) ; + + C->p = Mask->p_is_32 ? (void *) Cp32 : (void *) Cp64 ; + C->i = Mask->i_is_32 ? (void *) Ci32 : (void *) Ci64 ; + C->x = Cx ; + C->p_size = (Mask->p_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) + * (Mask->vdim + 1) ; + C->i_size = (Mask->i_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) * centries ; + C->x_size = C->iso ? op->ztype->size + : op->ztype->size * centries ; + C->magic = GB_MAGIC ; + C->nvals = centries ; + C->nvec_nonempty = (int64_t) nvecs ; + + //---------------------------------------------------------------------- + // evaluate + //---------------------------------------------------------------------- + + info = GB_kroner_jit (C, op, false, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads) ; + + if (info != GrB_SUCCESS) + { + info = GB_kroner_generic (C, op, false, A, B, + A_is_pattern, B_is_pattern, A_transpose, B_transpose, + Mask, Mask_comp, Mask_struct, false, nthreads) ; + } - return GrB_SUCCESS ; + GB_FREE_WORKSPACE ; + return info ; } + //-------------------------------------------------------------------------- + // default case (no mask, or complemented mask) + //-------------------------------------------------------------------------- + //-------------------------------------------------------------------------- // bitmap case: create sparse copies of A and B if they are bitmap //-------------------------------------------------------------------------- - GrB_Matrix A = A_in ; if (GB_IS_BITMAP (A)) - { + { GBURBLE ("A:") ; GB_CLEAR_MATRIX_HEADER (Awork, &Awork_header) ; GB_OK (GB_dup_worker (&Awork, A->iso, A, true, NULL)) ; @@ -406,9 +508,8 @@ GrB_Info GB_kroner // C = kron (A,B) A = Awork ; } - GrB_Matrix B = B_in ; if (GB_IS_BITMAP (B)) - { + { GBURBLE ("B:") ; GB_CLEAR_MATRIX_HEADER (Bwork, &Bwork_header) ; GB_OK (GB_dup_worker (&Bwork, B->iso, B, true, NULL)) ; @@ -428,7 +529,7 @@ GrB_Info GB_kroner // C = kron (A,B) const int64_t avlen = A->vlen ; const int64_t avdim = A->vdim ; const int64_t anvec = A->nvec ; - const int64_t anz = GB_nnz (A) ; + const int64_t anz = GB_nnz (A) ; GB_Bp_DECLARE (Bp, const) ; GB_Bp_PTR (Bp, B) ; GB_Bh_DECLARE (Bh, const) ; GB_Bh_PTR (Bh, B) ; @@ -436,7 +537,7 @@ GrB_Info GB_kroner // C = kron (A,B) const int64_t bvlen = B->vlen ; const int64_t bvdim = B->vdim ; const int64_t bnvec = B->nvec ; - const int64_t bnz = GB_nnz (B) ; + const int64_t bnz = GB_nnz (B) ; //-------------------------------------------------------------------------- // determine the number of threads to use @@ -444,57 +545,40 @@ GrB_Info GB_kroner // C = kron (A,B) double work = ((double) anz) * ((double) bnz) + (((double) anvec) * ((double) bnvec)) ; - int nthreads_max = GB_Context_nthreads_max ( ) ; double chunk = GB_Context_chunk ( ) ; int nthreads = GB_nthreads (work, chunk, nthreads_max) ; - //-------------------------------------------------------------------------- - // check if C is iso and compute its iso value if it is - //-------------------------------------------------------------------------- - - GrB_Type ctype = op->ztype ; - const size_t csize = ctype->size ; - GB_void cscalar [GB_VLA(csize)] ; - bool C_iso = GB_emult_iso (cscalar, ctype, A, B, op) ; - //-------------------------------------------------------------------------- // allocate the output matrix C //-------------------------------------------------------------------------- - // C has the same type as z for the multiply operator, z=op(x,y) - uint64_t cvlen, cvdim, cnzmax, cnvec ; - bool ok = GB_int64_multiply (&cvlen, avlen, bvlen) ; - ok = ok & GB_int64_multiply (&cvdim, avdim, bvdim) ; - ok = ok & GB_int64_multiply (&cnzmax, anz, bnz) ; - ok = ok & GB_int64_multiply (&cnvec, anvec, bnvec) ; + bool ok = GB_int64_multiply (&cvlen, avlen, bvlen) ; + ok = ok & GB_int64_multiply (&cvdim, avdim, bvdim) ; + ok = ok & GB_int64_multiply (&cnzmax, anz, bnz) ; + ok = ok & GB_int64_multiply (&cnvec, anvec, bnvec) ; ASSERT (ok) ; if (C_iso) - { - // the values of A and B are no longer needed if C is iso + { GBURBLE ("(iso kron) ") ; A_is_pattern = true ; B_is_pattern = true ; } - // C is hypersparse if either A or B are hypersparse. It is never bitmap. bool C_is_hyper = (cvdim > 1) && (Ah != NULL || Bh != NULL) ; - bool C_is_full = GB_as_if_full (A) && GB_as_if_full (B) ; - int C_sparsity = C_is_full ? GxB_FULL : - ((C_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE) ; - - // determine the p_is_32, j_is_32, and i_is_32 settings for the new matrix + bool C_is_full = GB_as_if_full (A) && GB_as_if_full (B) ; + int C_sparsity = C_is_full ? GxB_FULL : + C_is_hyper ? GxB_HYPERSPARSE : GxB_SPARSE ; bool Cp_is_32, Cj_is_32, Ci_is_32 ; GB_determine_pji_is_32 (&Cp_is_32, &Cj_is_32, &Ci_is_32, C_sparsity, cnzmax, (int64_t) cvlen, (int64_t) cvdim, Werk) ; - GB_OK (GB_new_bix (&C, // full, sparse, or hyper; existing header - ctype, (int64_t) cvlen, (int64_t) cvdim, GB_ph_malloc, C_is_csc, - C_sparsity, true, B->hyper_switch, cnvec, cnzmax, true, C_iso, - Cp_is_32, Cj_is_32, Ci_is_32)) ; + GB_OK (GB_new_bix (&C, ctype, (int64_t) cvlen, (int64_t) cvdim, + GB_ph_malloc, C_is_csc, C_sparsity, true, B->hyper_switch, + cnvec, cnzmax, true, C_iso, Cp_is_32, Cj_is_32, Ci_is_32)) ; //-------------------------------------------------------------------------- // compute the column counts of C: Cp and Ch if C is hypersparse @@ -502,34 +586,25 @@ GrB_Info GB_kroner // C = kron (A,B) GB_Cp_DECLARE (Cp, ) ; GB_Cp_PTR (Cp, C) ; GB_Ch_DECLARE (Ch, ) ; GB_Ch_PTR (Ch, C) ; - #define GB_Cp_IS_32 Cp_is_32 + //#define GB_Cp_IS_32 Cp_is_32 if (!C_is_full) - { - // C is sparse or hypersparse + { int64_t kC ; #pragma omp parallel for num_threads(nthreads) schedule(static) - for (kC = 0 ; kC < cnvec ; kC++) + for (kC = 0 ; kC < (int64_t) cnvec ; kC++) { const int64_t kA = kC / bnvec ; const int64_t kB = kC % bnvec ; - // get A(:,jA), the (kA)th vector of A const int64_t jA = GBh_A (Ah, kA) ; const int64_t aknz = (Ap == NULL) ? avlen : (GB_IGET (Ap, kA+1) - GB_IGET (Ap, kA)) ; - // get B(:,jB), the (kB)th vector of B const int64_t jB = GBh_B (Bh, kB) ; const int64_t bknz = (Bp == NULL) ? bvlen : (GB_IGET (Bp, kB+1) - GB_IGET (Bp, kB)) ; - // determine # entries in C(:,jC), the (kC)th vector of C - // int64_t kC = kA * bnvec + kB ; - // Cp [kC] = aknz * bknz ; GB_ISET (Cp, kC, aknz * bknz) ; if (C_is_hyper) - { - // Ch [kC] = jA * bvdim + jB ; GB_ISET (Ch, kC, jA * bvdim + jB) ; - } } int64_t nvec_nonempty ; @@ -542,19 +617,17 @@ GrB_Info GB_kroner // C = kron (A,B) C->magic = GB_MAGIC ; //-------------------------------------------------------------------------- - // C = kron (A,B) where C is iso and/or full full + // C = kron (A,B) where C is iso and/or full //-------------------------------------------------------------------------- if (C_iso) - { - // C->x [0] = cscalar = op (A,B) + { memcpy (C->x, cscalar, csize) ; if (C_is_full) - { - // no more work to do if C is iso and full + { ASSERT_MATRIX_OK (C, "C=kron(A,B), iso full", GB0) ; GB_FREE_WORKSPACE ; - return (GrB_SUCCESS) ; + return GrB_SUCCESS ; } } @@ -564,93 +637,22 @@ GrB_Info GB_kroner // C = kron (A,B) int64_t cnz = GB_nnz (C) ; if (cnz == 0) - { + { GB_FREE_WORKSPACE ; - return (GrB_SUCCESS) ; + return GrB_SUCCESS ; } //-------------------------------------------------------------------------- - // C = kron (A,B) + // evaluate: JIT or generic //-------------------------------------------------------------------------- - // via the JIT kernel - info = GB_kroner_jit (C, op, flipij, A, B, nthreads) ; - - if (info == GrB_NO_VALUE) - { - // via the generic kernel - #define GB_A_TYPE GB_void - #define GB_B_TYPE GB_void - #define GB_C_TYPE GB_void - #define GB_A_ISO A_iso - #define GB_B_ISO B_iso - #define GB_C_ISO C_iso - const bool A_iso = A->iso ; - const bool B_iso = B->iso ; - const int64_t asize = A->type->size ; - const int64_t bsize = B->type->size ; - - GxB_binary_function fmult = op->binop_function ; - GxB_index_binary_function fmult_idx = op->idxbinop_function ; - const void *theta = op->theta ; - GB_cast_function cast_A = NULL, cast_B = NULL ; - if (!A_is_pattern) - { - cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; - } - if (!B_is_pattern) - { - cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; - } - - #define GB_C_IS_FULL C_is_full - - #define GB_DECLAREA(a) GB_void a [GB_VLA(asize)] - #define GB_DECLAREB(b) GB_void b [GB_VLA(bsize)] - - #define GB_GETA(a,Ax,p,iso) \ - { \ - if (!A_is_pattern) \ - { \ - cast_A (a, Ax + (p)*asize, asize) ; \ - } \ - } - - #define GB_GETB(b,Bx,p,iso) \ - { \ - if (!B_is_pattern) \ - { \ - cast_B (b, Bx + (p)*bsize, bsize) ; \ - } \ - } - - #define GB_KRONECKER_OP(Cx,pC,a,ix,jx,b,iy,jy) \ - { \ - if (fmult != NULL) \ - { \ - /* standard binary operator */ \ - fmult (Cx +(pC)*csize, a, b) ; \ - } \ - else \ - { \ - /* index binary operator */ \ - if (flipij) \ - { \ - fmult_idx (Cx +(pC)*csize, \ - a, jx, ix, b, jy, iy, theta) ; \ - } \ - else \ - { \ - fmult_idx (Cx +(pC)*csize, \ - a, ix, jx, b, iy, jy, theta) ; \ - } \ - } \ - } + info = GB_kroner_jit (C, op, flipij, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads) ; - #define GB_GENERIC - #include "ewise/include/GB_ewise_shared_definitions.h" - #include "kronecker/template/GB_kroner_template.c" - info = GrB_SUCCESS ; + if (info != GrB_SUCCESS) + { + info = GB_kroner_generic (C, op, flipij, A, B, + A_is_pattern, B_is_pattern, A_transpose, B_transpose, + Mask, Mask_comp, Mask_struct, C_is_full, nthreads) ; } //-------------------------------------------------------------------------- @@ -658,16 +660,11 @@ GrB_Info GB_kroner // C = kron (A,B) //-------------------------------------------------------------------------- if (info == GrB_SUCCESS) - { + { GB_OK (GB_hyper_prune (C, Werk)) ; ASSERT_MATRIX_OK (C, "C=kron(A,B)", GB0) ; } - //-------------------------------------------------------------------------- - // return result - //-------------------------------------------------------------------------- - GB_FREE_WORKSPACE ; - return (info) ; + return info ; } - diff --git a/Source/kronecker/template/GB_kroner_template.c b/Source/kronecker/template/GB_kroner_template.c index bdc76990c0..c306df0058 100644 --- a/Source/kronecker/template/GB_kroner_template.c +++ b/Source/kronecker/template/GB_kroner_template.c @@ -47,135 +47,334 @@ // C = kron (A,B) //-------------------------------------------------------------------------- - int tid ; - #pragma omp parallel for num_threads(nthreads) schedule(static) - for (tid = 0 ; tid < nthreads ; tid++) + if (!GB_NO_MASK && !GB_MASK_COMP) { + #define GB_GETVECTOR(A, row, col) GrB_Index vector_ = (A)->is_csc ? (col) : (row) + + #define GB_GETCOORD(A, row, col) GrB_Index coord_ = (A)->is_csc ? (row) : (col) + + #define GB_LOOKUP_XOFFSET_BITMAP_FULL(offset, A) \ + { \ + GrB_Index offset_ = vector_ * (A)->vlen + \ + coord_ ; \ + if ((A)->b == NULL || \ + (((int8_t *) (A)->b)[offset_])) \ + { \ + offset = (A)->iso ? 0 : offset_ ; \ + } \ + else \ + { \ + offset = -1 ; \ + } \ + } - //---------------------------------------------------------------------- - // get the iso values of A and B - //---------------------------------------------------------------------- + #define GB_LOOKUP_XOFFSET_SPARSE(offset, A) \ + { \ + start_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [vector_] : \ + ((int64_t *)(A)->p) [vector_] ; \ + end_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [vector_ + 1] : \ + ((int64_t *)(A)->p) [vector_ + 1] ; \ + end_-- ; \ + if (start_ > end_) \ + { \ + offset = -1 ; \ + } \ + else \ + { \ + if (GB_binary_search (coord_, (A)->i, \ + (A)->i_is_32, &start_, &end_)) \ + { \ + offset = (A)->iso ? 0 : start_ ; \ + } \ + else \ + { \ + offset = - 1 ; \ + } \ + } \ + } - GB_DECLAREA (a) ; - if (GB_A_ISO) - { - GB_GETA (a, Ax, 0, true) ; + #define GB_LOOKUP_XOFFSET_HYPER(offset, A) \ + { \ + start_ = 0 ; \ + end_ = (A)->plen - 1 ; \ + if (!GB_binary_search (vector_, (A)->h, \ + (A)->j_is_32, &start_, &end_)) \ + { \ + offset = - 1; \ + } \ + else \ + { \ + int64_t k_ = start_ ; \ + start_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [k_] : \ + ((int64_t *)(A)->p) [k_] ; \ + end_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [k_ + 1] : \ + ((int64_t *)(A)->p) [k_ + 1] ; \ + end_-- ; \ + if (start_ > end_) \ + { \ + offset = -1 ; \ + } \ + else \ + { \ + if (GB_binary_search (coord_, (A)->i, \ + (A)->i_is_32, &start_, &end_)) \ + { \ + offset = (A)->iso ? 0 : start_ ; \ + } \ + else \ + { \ + offset = -1 ; \ + } \ + } \ + } \ } - GB_DECLAREB (b) ; - if (GB_B_ISO) - { - GB_GETB (b, Bx, 0, true) ; + + #define GB_LOOKUP_XOFFSET(offset, A, row, col) \ + { \ + GB_GETVECTOR(A, row, col) ; \ + GB_GETCOORD(A, row, col) ; \ + if ((A)->p == NULL) \ + { \ + GB_LOOKUP_XOFFSET_BITMAP_FULL (offset, A) ; \ + } \ + else \ + { \ + int64_t start_, end_ ; \ + if ((A)->h == NULL) \ + { \ + GB_LOOKUP_XOFFSET_SPARSE(offset, A) ; \ + } \ + else \ + { \ + GB_LOOKUP_XOFFSET_HYPER(offset, A) ; \ + } \ + } \ } - //---------------------------------------------------------------------- - // construct the task to compute Ci,Cx [pC:pC_end-1] - //---------------------------------------------------------------------- + #define GB_NROWS(A) ((A)->is_csc ? (A)->vlen : (A)->vdim) + #define GB_NCOLS(A) ((A)->is_csc ? (A)->vdim : (A)->vlen) + - int64_t pC, pC_end ; - GB_PARTITION (pC, pC_end, cnz, tid, nthreads) ; + GB_Mp_DECLARE(Mp, ) ; + GB_Mp_PTR(Mp, Mask) ; - // find where this task starts in C - int64_t kC_task = GB_search_for_vector (Cp, GB_Cp_IS_32, pC, 0, cnvec, - cvlen) ; - int64_t pC_delta = pC - GBp_C (Cp, kC_task, cvlen) ; + GB_Mh_DECLARE(Mh, ) ; + GB_Mh_PTR(Mh, Mask) ; - //---------------------------------------------------------------------- - // compute C(:,kC) for all vectors kC in this task - //---------------------------------------------------------------------- + GB_Mi_DECLARE(Mi, ) ; + GB_Mi_PTR(Mi, Mask) ; - for (int64_t kC = kC_task ; kC < cnvec && pC < pC_end ; kC++) + + int64_t bnrows = (B_transpose) ? GB_NCOLS (B) : GB_NROWS (B) ; + int64_t bncols = (B_transpose) ? GB_NROWS (B) : GB_NCOLS (B) ; + + #pragma omp parallel num_threads(nthreads) { + GrB_Index offset = -1 ; + GB_DECLAREA (a_elem) ; + GB_DECLAREB (b_elem) ; + int64_t vlen = Mask->vlen ; - //------------------------------------------------------------------ - // get the vectors C(:,jC), A(:,jA), and B(:,jB) - //------------------------------------------------------------------ - - // C(:,jC) = kron (A(:,jA), B(:,jB), the (kC)th vector of C, - // where jC = GBh_C (Ch, kC) - int64_t kA = kC / bnvec ; - int64_t kB = kC % bnvec ; - - // get A(:,jA), the (kA)th vector of A - int64_t jA = GBh_A (Ah, kA) ; - int64_t pA_start = GBp_A (Ap, kA, avlen) ; - int64_t pA_end = GBp_A (Ap, kA+1, avlen) ; - - // get B(:,jB), the (kB)th vector of B - int64_t jB = GBh_B (Bh, kB) ; - int64_t pB_start = GBp_B (Bp, kB, bvlen) ; - int64_t pB_end = GBp_B (Bp, kB+1, bvlen) ; - int64_t bknz = pB_end - pB_start ; - - // shift into the middle of A(:,jA) and B(:,jB) for the first - // vector of C for this task. - int64_t pA_delta = 0 ; - int64_t pB_delta = 0 ; - if (kC == kC_task && bknz > 0) + #pragma omp for schedule(static) + for (GrB_Index k = 0 ; k < Mask->nvec ; k++) + { + GrB_Index j = GBh_M (Mh, k) ; + + int64_t pA_start = GBp_M (Mp, k, vlen) ; + int64_t pA_end = GBp_M (Mp, k+1, vlen) ; + GrB_Index pos = Mask->p_is_32 ? ((int32_t *)C->p)[j] : ((int64_t *)C->p)[j] ; + for (GrB_Index p = pA_start ; p < pA_end ; p++) + { + if (!GBb_M (Mask->b, p)) continue ; + + int64_t i = GBi_M (Mi, p, vlen) ; + GrB_Index Mrow = Mask->is_csc ? i : j ; GrB_Index Mcol = Mask->is_csc ? j : i ; + + // extract elements from A and B, + // initialize offset in MTi and MTx, + // get result of op, place it in MTx + + if (GB_MASK_STRUCT || (Mask->iso ? ((bool *)Mask->x)[0] : ((bool *)Mask->x)[p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + + GB_LOOKUP_XOFFSET (offset, A, arow, acol) ; + if (offset == -1) + { + continue; + } + // iso of A or of C (?) + GB_GETA (a_elem, Ax, offset, GB_A_ISO) ; + + GB_LOOKUP_XOFFSET (offset, B, brow, bcol) ; + if (offset == -1) + { + continue; + } + GB_GETB (b_elem, Bx, offset, GB_B_ISO) ; + + GrB_Index ix, jx, iy, jy ; + ix = A_transpose ? acol : arow ; + jx = A_transpose ? arow : acol ; + iy = B_transpose ? bcol : brow ; + jy = B_transpose ? brow : bcol ; + + if (Mask->i_is_32) + { + ((int32_t *)C->i)[pos] = i ; + } + else + { + ((int64_t *)C->i)[pos] = i ; + } + + GB_KRONECKER_OP (Cx, pos, a_elem, ix, jx, b_elem, iy, jy) ; + + pos++ ; + } + } + } + } + } + else + { + int tid ; + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (tid = 0 ; tid < nthreads ; tid++) + { + + //---------------------------------------------------------------------- + // get the iso values of A and B + //---------------------------------------------------------------------- + + GB_DECLAREA (a) ; + if (GB_A_ISO) + { + GB_GETA (a, Ax, 0, true) ; + } + GB_DECLAREB (b) ; + if (GB_B_ISO) { - pA_delta = pC_delta / bknz ; - pB_delta = pC_delta % bknz ; + GB_GETB (b, Bx, 0, true) ; } - //------------------------------------------------------------------ - // for all entries in A(:,jA), skipping entries for first vector - //------------------------------------------------------------------ + //---------------------------------------------------------------------- + // construct the task to compute Ci,Cx [pC:pC_end-1] + //---------------------------------------------------------------------- - int64_t pA = pA_start + pA_delta ; - pA_delta = 0 ; - for ( ; pA < pA_end && pC < pC_end ; pA++) - { + int64_t pC, pC_end ; + GB_PARTITION (pC, pC_end, cnz, tid, nthreads) ; + + // find where this task starts in C + int64_t kC_task = GB_search_for_vector (Cp, GB_Cp_IS_32, pC, 0, cnvec, + cvlen) ; + int64_t pC_delta = pC - GBp_C (Cp, kC_task, cvlen) ; - //-------------------------------------------------------------- - // a = A(iA,jA), typecasted to op->xtype - //-------------------------------------------------------------- + //---------------------------------------------------------------------- + // compute C(:,kC) for all vectors kC in this task + //---------------------------------------------------------------------- - int64_t iA = GBi_A (Ai, pA, avlen) ; - int64_t iAblock = iA * bvlen ; - if (!GB_A_ISO) + for (int64_t kC = kC_task ; kC < cnvec && pC < pC_end ; kC++) + { + + //------------------------------------------------------------------ + // get the vectors C(:,jC), A(:,jA), and B(:,jB) + //------------------------------------------------------------------ + + // C(:,jC) = kron (A(:,jA), B(:,jB), the (kC)th vector of C, + // where jC = GBh_C (Ch, kC) + int64_t kA = kC / bnvec ; + int64_t kB = kC % bnvec ; + + // get A(:,jA), the (kA)th vector of A + int64_t jA = GBh_A (Ah, kA) ; + int64_t pA_start = GBp_A (Ap, kA, avlen) ; + int64_t pA_end = GBp_A (Ap, kA+1, avlen) ; + + // get B(:,jB), the (kB)th vector of B + int64_t jB = GBh_B (Bh, kB) ; + int64_t pB_start = GBp_B (Bp, kB, bvlen) ; + int64_t pB_end = GBp_B (Bp, kB+1, bvlen) ; + int64_t bknz = pB_end - pB_start ; + + // shift into the middle of A(:,jA) and B(:,jB) for the first + // vector of C for this task. + int64_t pA_delta = 0 ; + int64_t pB_delta = 0 ; + if (kC == kC_task && bknz > 0) { - GB_GETA (a, Ax, pA, false) ; + pA_delta = pC_delta / bknz ; + pB_delta = pC_delta % bknz ; } - //-------------------------------------------------------------- - // for all entries in B(:,jB), skipping entries for 1st vector - //-------------------------------------------------------------- + //------------------------------------------------------------------ + // for all entries in A(:,jA), skipping entries for first vector + //------------------------------------------------------------------ - // scan B(:,jB), skipping to the first entry of C if this is - // the first time B is accessed in this task - int64_t pB = pB_start + pB_delta ; - pB_delta = 0 ; - for ( ; pB < pB_end && pC < pC_end ; pB++) - { + int64_t pA = pA_start + pA_delta ; + pA_delta = 0 ; + for ( ; pA < pA_end && pC < pC_end ; pA++) + { - //---------------------------------------------------------- - // b = B(iB,jB), typecasted to op->ytype - //---------------------------------------------------------- + //-------------------------------------------------------------- + // a = A(iA,jA), typecasted to op->xtype + //-------------------------------------------------------------- - int64_t iB = GBi_B (Bi, pB, bvlen) ; - if (!GB_B_ISO) + int64_t iA = GBi_A (Ai, pA, avlen) ; + int64_t iAblock = iA * bvlen ; + if (!GB_A_ISO) { - GB_GETB (b, Bx, pB, false) ; + GB_GETA (a, Ax, pA, false) ; } - //---------------------------------------------------------- - // C(iC,jC) = A(iA,jA) * B(iB,jB) - //---------------------------------------------------------- + //-------------------------------------------------------------- + // for all entries in B(:,jB), skipping entries for 1st vector + //-------------------------------------------------------------- - if (!GB_C_IS_FULL) - { - // save the row index iC - // Ci [pC] = iAblock + iB ; - GB_ISET (Ci, pC, iAblock + iB) ; - } - // Cx [pC] = op (a, b) - if (!GB_C_ISO) + // scan B(:,jB), skipping to the first entry of C if this is + // the first time B is accessed in this task + int64_t pB = pB_start + pB_delta ; + pB_delta = 0 ; + for ( ; pB < pB_end && pC < pC_end ; pB++) { - GB_KRONECKER_OP (Cx, pC, a, iA, jA, b, iB, jB) ; + + //---------------------------------------------------------- + // b = B(iB,jB), typecasted to op->ytype + //---------------------------------------------------------- + + int64_t iB = GBi_B (Bi, pB, bvlen) ; + if (!GB_B_ISO) + { + GB_GETB (b, Bx, pB, false) ; + } + + //---------------------------------------------------------- + // C(iC,jC) = A(iA,jA) * B(iB,jB) + //---------------------------------------------------------- + + if (!GB_C_IS_FULL) + { + // save the row index iC + // Ci [pC] = iAblock + iB ; + GB_ISET (Ci, pC, iAblock + iB) ; + } + // Cx [pC] = op (a, b) + if (!GB_C_ISO) + { + GB_KRONECKER_OP (Cx, pC, a, iA, jA, b, iB, jB) ; + } + pC++ ; } - pC++ ; } } } } } - From 9aed2ec6ed991aa58bc6ff05dca23b5f8c5240e2 Mon Sep 17 00:00:00 2001 From: Ilya Maltsev Date: Tue, 5 May 2026 00:23:54 +0300 Subject: [PATCH 6/7] fix: fixed codestyle and matrix generation in tests --- Test/test226.m | 5 +-- Test/test227.m | 102 +++++++++++++++++++++---------------------------- 2 files changed, 45 insertions(+), 62 deletions(-) diff --git a/Test/test226.m b/Test/test226.m index 4778425a8a..7e3759128a 100644 --- a/Test/test226.m +++ b/Test/test226.m @@ -9,8 +9,8 @@ A.matrix = sprand (5, 10, 0.4) ; B.matrix = ones (3, 2) ; B.iso = true ; -M.matrix = sprandn (15, 20,0.2) ~= 0 ; -MT.matrix = sprandn (9, 4, 20,0.2) ~= 0 ; +M.matrix = sprand (15, 20, 0.2) ~= 0 ; +MT.matrix = sprand (9, 4, 20, 0.2) ~= 0 ; mult.opname = 'times' ; mult.optype = 'double' ; @@ -42,4 +42,3 @@ GB_spec_compare (C1, C2) ; fprintf ('\ntest226: all tests passed\n') ; - diff --git a/Test/test227.m b/Test/test227.m index 50dd0153d1..c45653c91e 100644 --- a/Test/test227.m +++ b/Test/test227.m @@ -33,18 +33,15 @@ bm = 4 ; bn = 2 ; -Ax_temp = 100 * sprandn (am, an, 0.5); -Bx_temp = 100 * sprandn (bm, bn, 0.5); - -Ax = sparse(round(Ax_temp)); -Bx = sparse(round(Bx_temp)); +Ax = sparse (100 * sprandn (am, an, 0.5)) ; +Bx = sparse (100 * sprandn (bm, bn, 0.5)) ; cm = am * bm ; cn = an * bn ; -Cx = sparse (cm,cn) ; -Maskmat = sprandn (cm,cn,0.2) ~= 0 ; -ATmat = Ax' ; -BTmat = Bx' ; +Cx = sparse (cm, cn) ; +Maskmat = sprandn (cm, cn, 0.2) ~= 0 ; +AT = Ax' ; +BT = Bx' ; for k2 = [4 7 45:52 ] for k1 = 1:4 @@ -71,26 +68,16 @@ clear A A.matrix = Ax ; A.is_hyper = A_is_hyper ; - A.is_csc = A_is_csc ; + A.is_csc = A_is_csc ; clear B B.matrix = Bx ; B.is_hyper = B_is_hyper ; - B.is_csc = B_is_csc ; + B.is_csc = B_is_csc ; clear C - C.matrix = sparse (cm,cn) ; - C.is_csc = C_is_csc ; - - clear AT - AT.matrix = ATmat ; - AT.is_hyper = A_is_hyper ; - AT.is_csc = A.is_csc ; - - clear BT - BT.matrix = BTmat ; - BT.is_hyper = B_is_hyper ; - BT.is_csc = B.is_csc ; + C.matrix = Cx ; + C.is_csc = C_is_csc ; %--------------------------------------- % kron(A,B) @@ -130,41 +117,39 @@ % tests with Mask for Mask_is_hyper = 0:1 - for Mask_is_csc = 0:1 - fprintf('*') - - A.is_csc = A_is_csc ; - B.is_csc = B_is_csc ; - A.is_hyper = A_is_hyper ; - B.is_hyper = B_is_hyper ; - - clear M - M.matrix = Maskmat ; - M.is_hyper = Mask_is_hyper ; - M.is_csc = Mask_is_csc; - C.is_csc = C_is_csc ; - - % kron(A, B) with Mask - C0 = GB_spec_kron (C, M, [ ], op, A, B, dnn) ; - fprintf('#') ; - C1 = GB_mex_kron (C, M, [ ], op, A, B, dnn) ; - GB_spec_compare(C0, C1) ; - - % kron(A', B) with Mask - C0 = GB_spec_kron (C, M, [ ], op, AT, B, dtn) ; - C1 = GB_mex_kron (C, M, [ ], op, AT, B, dtn) ; - GB_spec_compare (C0, C1) ; - - % kron(A, B') with Mask - C0 = GB_spec_kron (C, M, [ ], op, A, BT, dnt) ; - C1 = GB_mex_kron (C, M, [ ], op, A, BT, dnt) ; - GB_spec_compare (C0, C1) ; - - % kron(A', B') with Mask - C0 = GB_spec_kron (C, M, [ ], op, AT, BT, dtt) ; - C1 = GB_mex_kron (C, M, [ ], op, AT, BT, dtt) ; - GB_spec_compare (C0, C1) ; - end + for Mask_is_csc = 0:1 + + A.is_csc = A_is_csc ; + B.is_csc = B_is_csc ; + A.is_hyper = A_is_hyper ; + B.is_hyper = B_is_hyper ; + + clear M + M.matrix = Maskmat ; + M.is_hyper = Mask_is_hyper ; + M.is_csc = Mask_is_csc; + C.is_csc = C_is_csc ; + + % kron(A, B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, B, dnn) ; + C1 = GB_mex_kron (C, M, [ ], op, A, B, dnn) ; + GB_spec_compare(C0, C1) ; + + % kron(A', B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, B, dtn) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, B, dtn) ; + GB_spec_compare (C0, C1) ; + + % kron(A, B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, BT, dnt) ; + C1 = GB_mex_kron (C, M, [ ], op, A, BT, dnt) ; + GB_spec_compare (C0, C1) ; + + % kron(A', B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, BT, dtt) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, BT, dtt) ; + GB_spec_compare (C0, C1) ; + end end end end @@ -189,4 +174,3 @@ GB_spec_compare (C0, C1) ; fprintf ('\ntest227: all tests passed\n') ; - From 131c9233ec4aaa0afc7b21dc6bb450eac32f7e6f Mon Sep 17 00:00:00 2001 From: Ilya Maltsev Date: Tue, 5 May 2026 00:33:56 +0300 Subject: [PATCH 7/7] feat: C has its own bitness control for array for cases when Mask uses extra 64 bit integers --- Source/kronecker/GB_kron.c | 1 - Source/kronecker/GB_kroner.c | 140 ++++++++++++------ .../kronecker/template/GB_kroner_template.c | 13 +- 3 files changed, 95 insertions(+), 59 deletions(-) diff --git a/Source/kronecker/GB_kron.c b/Source/kronecker/GB_kron.c index 7e414d7fbe..d3f61a6d5b 100644 --- a/Source/kronecker/GB_kron.c +++ b/Source/kronecker/GB_kron.c @@ -167,7 +167,6 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) GB_RETURN_IF_QUICK_MASK (C, C_replace, M, Mask_comp, Mask_struct) ; // check if it's possible to apply mask immediately in kron - // TODO: MT should have its own 32/64 bitness controls bool Mask_is_applicable = M != NULL && !Mask_comp ; if (Mask_is_applicable) { diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index fabfd2ceb2..4843bb09ff 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -274,8 +274,13 @@ GrB_Info GB_kroner // allocate Cp //---------------------------------------------------------------------- + int64_t mnzmax = GB_nnz_max (Mask) ; + bool Cp_is_32, Cj_is_32, Ci_is_32; + GB_determine_pji_is_32 (&Cp_is_32, &Cj_is_32, &Ci_is_32, + GxB_SPARSE, mnzmax, (int64_t) Mask->vlen, (int64_t) Mask->vdim, Werk) ; + uint32_t *Cp32 = NULL ; uint64_t *Cp64 = NULL ; - if (Mask->p_is_32) + if (Cp_is_32) Cp32 = GB_calloc_memory (Mask->vdim + 1, sizeof (uint32_t), &allocated) ; else @@ -349,8 +354,14 @@ GrB_Info GB_kroner if (!GB_lookup_xoffset (&offset, B, brow, bcol)) continue ; - if (Mask->p_is_32) { (Cp32 [j])++ ; } - else { (Cp64 [j])++ ; } + if (Cp_is_32) + { + (Cp32 [j])++ ; + } + else + { + (Cp64 [j])++ ; + } nonempty = true ; } } @@ -362,30 +373,34 @@ GrB_Info GB_kroner // prefix sum to get centries //---------------------------------------------------------------------- - if (Mask->p_is_32) - GB_cumsum (Cp32, Mask->p_is_32, Mask->vdim, NULL, nthreads, Werk) ; + if (Cp_is_32) + GB_cumsum (Cp32, Cp_is_32, Mask->vdim, NULL, nthreads, Werk) ; else - GB_cumsum (Cp64, Mask->p_is_32, Mask->vdim, NULL, nthreads, Werk) ; + GB_cumsum (Cp64, Cp_is_32, Mask->vdim, NULL, nthreads, Werk) ; - centries = Mask->p_is_32 ? (int64_t) Cp32 [Mask->vdim] - : (int64_t) Cp64 [Mask->vdim] ; + centries = Cp_is_32 ? (int64_t) Cp32 [Mask->vdim] + : (int64_t) Cp64 [Mask->vdim] ; //---------------------------------------------------------------------- // allocate Ci //---------------------------------------------------------------------- uint32_t *Ci32 = NULL ; uint64_t *Ci64 = NULL ; - if (Mask->i_is_32) + if (Ci_is_32) Ci32 = GB_malloc_memory (centries, sizeof (uint32_t), &allocated) ; else Ci64 = GB_malloc_memory (centries, sizeof (uint64_t), &allocated) ; if (centries > 0 && Ci32 == NULL && Ci64 == NULL) { - if (Mask->p_is_32) GB_free_memory (&Cp32, - (Mask->vdim + 1) * sizeof (uint32_t)) ; - else GB_free_memory (&Cp64, - (Mask->vdim + 1) * sizeof (uint64_t)) ; + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } GB_FREE_WORKSPACE ; return GrB_OUT_OF_MEMORY ; } @@ -400,14 +415,22 @@ GrB_Info GB_kroner Cx = GB_malloc_memory (1, op->ztype->size, &allocated) ; if (Cx == NULL) { - if (Mask->i_is_32) GB_free_memory (&Ci32, - centries * sizeof (uint32_t)) ; - else GB_free_memory (&Ci64, - centries * sizeof (uint64_t)) ; - if (Mask->p_is_32) GB_free_memory (&Cp32, - (Mask->vdim + 1) * sizeof (uint32_t)) ; - else GB_free_memory (&Cp64, - (Mask->vdim + 1) * sizeof (uint64_t)) ; + if (Ci_is_32) + { + GB_free_memory (&Ci32, centries * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Ci64, centries * sizeof (uint64_t)) ; + } + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } GB_FREE_WORKSPACE ; return GrB_OUT_OF_MEMORY ; } @@ -418,14 +441,22 @@ GrB_Info GB_kroner Cx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; if (centries > 0 && Cx == NULL) { - if (Mask->i_is_32) GB_free_memory (&Ci32, - centries * sizeof (uint32_t)) ; - else GB_free_memory (&Ci64, - centries * sizeof (uint64_t)) ; - if (Mask->p_is_32) GB_free_memory (&Cp32, - (Mask->vdim + 1) * sizeof (uint32_t)) ; - else GB_free_memory (&Cp64, - (Mask->vdim + 1) * sizeof (uint64_t)) ; + if (Ci_is_32) + { + GB_free_memory (&Ci32, centries * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Ci64, centries * sizeof (uint64_t)) ; + } + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } GB_FREE_WORKSPACE ; return GrB_OUT_OF_MEMORY ; } @@ -438,19 +469,33 @@ GrB_Info GB_kroner GrB_Info Calloc = GB_new_bix (&C, op->ztype, vlen, Mask->vdim, GB_ph_null, Mask->is_csc, GxB_SPARSE, false, Mask->hyper_switch, Mask->vdim, centries, false, C_iso, - Mask->p_is_32, Mask->j_is_32, Mask->i_is_32) ; + Cp_is_32, Cj_is_32, Ci_is_32) ; if (Calloc != GrB_SUCCESS) { - if (C_iso) GB_free_memory (&Cx, op->ztype->size) ; - else GB_free_memory (&Cx, centries * op->ztype->size) ; - if (Mask->i_is_32) GB_free_memory (&Ci32, - centries * sizeof (uint32_t)) ; - else GB_free_memory (&Ci64, - centries * sizeof (uint64_t)) ; - if (Mask->p_is_32) GB_free_memory (&Cp32, - (Mask->vdim + 1) * sizeof (uint32_t)) ; - else GB_free_memory (&Cp64, - (Mask->vdim + 1) * sizeof (uint64_t)) ; + if (C_iso) + { + GB_free_memory (&Cx, op->ztype->size) ; + } + else + { + GB_free_memory (&Cx, centries * op->ztype->size) ; + } + if (Ci_is_32) + { + GB_free_memory (&Ci32, centries * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Ci64, centries * sizeof (uint64_t)) ; + } + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } GB_FREE_WORKSPACE ; return Calloc ; } @@ -458,16 +503,15 @@ GrB_Info GB_kroner GB_free_memory (&C->i, C->i_size) ; GB_free_memory (&C->x, C->x_size) ; - C->p = Mask->p_is_32 ? (void *) Cp32 : (void *) Cp64 ; - C->i = Mask->i_is_32 ? (void *) Ci32 : (void *) Ci64 ; + C->p = Cp_is_32 ? (void *) Cp32 : (void *) Cp64 ; + C->i = Ci_is_32 ? (void *) Ci32 : (void *) Ci64 ; C->x = Cx ; - C->p_size = (Mask->p_is_32 ? sizeof (uint32_t) - : sizeof (uint64_t)) - * (Mask->vdim + 1) ; - C->i_size = (Mask->i_is_32 ? sizeof (uint32_t) - : sizeof (uint64_t)) * centries ; + C->p_size = (Cp_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) * (Mask->vdim + 1) ; + C->i_size = (Ci_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) * centries ; C->x_size = C->iso ? op->ztype->size - : op->ztype->size * centries ; + : op->ztype->size * centries ; C->magic = GB_MAGIC ; C->nvals = centries ; C->nvec_nonempty = (int64_t) nvecs ; diff --git a/Source/kronecker/template/GB_kroner_template.c b/Source/kronecker/template/GB_kroner_template.c index c306df0058..1520564a69 100644 --- a/Source/kronecker/template/GB_kroner_template.c +++ b/Source/kronecker/template/GB_kroner_template.c @@ -186,8 +186,8 @@ int64_t pA_start = GBp_M (Mp, k, vlen) ; int64_t pA_end = GBp_M (Mp, k+1, vlen) ; - GrB_Index pos = Mask->p_is_32 ? ((int32_t *)C->p)[j] : ((int64_t *)C->p)[j] ; - for (GrB_Index p = pA_start ; p < pA_end ; p++) + GrB_Index pos = GB_IGET(Cp, j); + for (GrB_Index p = pA_start; p < pA_end; p++) { if (!GBb_M (Mask->b, p)) continue ; @@ -227,14 +227,7 @@ iy = B_transpose ? bcol : brow ; jy = B_transpose ? brow : bcol ; - if (Mask->i_is_32) - { - ((int32_t *)C->i)[pos] = i ; - } - else - { - ((int64_t *)C->i)[pos] = i ; - } + GB_ISET (Ci, pos, i) ; GB_KRONECKER_OP (Cx, pos, a_elem, ix, jx, b_elem, iy, jy) ;