/* Name mapping for the gs entry points: each identifier below is redefined
   to the result of PREFIXED_NAME applied to the same identifier.
   PREFIXED_NAME is defined outside this excerpt.
   NOTE(review): presumably it applies a library-wide symbol prefix --
   confirm against its definition. */
26 #define gs PREFIXED_NAME(gs )
27 #define gs_vec PREFIXED_NAME(gs_vec )
28 #define gs_many PREFIXED_NAME(gs_many )
29 #define gs_setup PREFIXED_NAME(gs_setup )
30 #define gs_free PREFIXED_NAME(gs_free )
31 #define gs_unique PREFIXED_NAME(gs_unique)
41 void *
out,
const void *in,
const unsigned vn,
46 void *
out,
const void *in,
const unsigned vn,
51 void *
out,
const unsigned vn,
102 end->
flag = id_i!=abs_id;
108 for(row=nz->
ptr,end=row+nz->
n;row!=end;++row) {
109 ulong this_id = row->id;
110 if(this_id!=last_id) primary = row->i;
126 for(nz_row=nz->
ptr,nz_end=nz_row+nz->
n;nz_row!=nz_end;++nz_row) {
127 if(nz_row->
i != nz_row->
primary)
continue;
128 un_row->
id = nz_row->
id;
/* Flag values with distinct bits (1 and 2), so they can be combined and
   tested with bitwise masks. FLAGS_LOCAL is used as a mask in the
   `flag=(i1f^i1)&FLAGS_LOCAL` assignment; no use of FLAGS_REMOTE is visible
   in this excerpt.
   NOTE(review): the names suggest "flagged on the local side" versus
   "flagged on the remote side" -- confirm against the code that sets them. */
148 #define FLAGS_LOCAL 1
149 #define FLAGS_REMOTE 2
177 for(w=wa->
ptr,we=w+wa->
n;w!=we;++w) {
179 uint i1 = ~i1f<i1f?~i1f:i1f, i2 = ~i2f<i2f?~i2f:i2f;
185 p->
id=last_id, p->
ord=w->
ord, p->
i=i1, p->
flag=(i1f^i1)&FLAGS_LOCAL;
198 ulong ordinal[2], n_shared=0, scan_buf[2];
209 for(un_row=un.
ptr,un_end=un_row+un.
n;un_row!=un_end;++un_row) {
212 if(
id==last_id)
continue;
214 if(other!=un_end&&other->
id==
id) last_id=
id, ++n_shared;
223 for(un_row=un.
ptr,un_end=un_row+un.
n;un_row!=un_end;++un_row) {
227 for(other=un_row+1;other!=un_end&&other->
id==
id;++other) {
230 if(
id!=last_id) last_id=
id, ++ordinal[0];
259 struct array *
const nz=&top->
nz, *
const sh=&top->
sh, *
const pr=&top->
pr;
268 for(p=nz->
ptr,e=p+nz->
n;p!=e;++p)
272 for(p=nz->
ptr,e=p+nz->
n;p!=e;++p)
286 out=sh->ptr; pnz=top->
nz.
ptr;
287 for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
289 while(pnz->
i!=i) ++pnz;
294 owner = pb->
id%(lt+gt+1);
295 if(owner==lt)
goto make_sh_unique_mine;
296 if(owner>lt) --owner;
298 owner = pb->
id%(lt+gt);
311 sh->n = out - ((
struct shared_id*)sh->ptr);
318 for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
320 pe=pb;
while(pe->
i==i) ++pe;
321 if(q->
id!=pb->
id) printf(
"FAIL!!!\n");
335 uint *map, *
p, count = 1;
337 #define DO_COUNT(cond) do \
338 for(row=nz->ptr,end=row+nz->n;row!=end;) { \
339 ulong row_id = row->id; int any=0; \
340 for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
342 count+=any, row=other; \
347 #define DO_SET(cond) do \
348 for(row=nz->ptr,end=row+nz->n;row!=end;) { \
349 ulong row_id = row->id; int any=0; \
351 for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
352 any=1, *p++ = other->i; \
353 if(any) *p++ = -(uint)1; else --p; \
364 uint *map, *
p, count=1;
366 for(row=nz->
ptr,end=row+nz->
n;row!=end;++row)
369 for(row=nz->
ptr,end=row+nz->
n;row!=end;++row)
416 for(p=c->
p,pe=p+c->
n;p!=pe;++p) {
417 size_t len = *(size++)*unit_size;
429 for(p=c->
p,pe=p+c->
n;p!=pe;++p) {
430 size_t len = *(size++)*unit_size;
441 const struct pw_data *pwd = execdata;
447 unsigned unit_size = vn*gs_dom_size[
dom];
452 scatter_to_buf[mode](sendbuf,data,vn,pwd->
map[send],
dom);
458 gather_from_buf[mode](data,buf,vn,pwd->
map[recv],
dom,op);
465 const unsigned flags_mask,
buffer *buf)
467 uint n=0,count=0, lp=-(
uint)1, mem_size=0;
472 for(s=sh->
ptr,se=s+sh->
n;s!=se;++s) {
473 if(s->
flags&flags_mask) { s->
bi = -(
uint)1;
continue; }
475 if(s->
p!=lp) lp=s->
p, ++
n;
482 for(s=sh->
ptr,se=s+sh->
n;s!=se;++s) {
483 if(s->
flags&flags_mask)
continue;
486 if(n!=0) data->
size[n-1] = count;
487 count=0, data->
p[n++]=lp;
491 if(n!=0) data->
size[n-1] = count;
500 uint count=0, *map, *
p;
505 for(s=sh->
ptr,se=s+sh->
n;s!=se;) {
507 if(s->
bi==-(
uint)1) { ++s;
continue; }
509 for(++s;s!=se&&s->
i==
i;++s)
if(s->
bi!=-(
uint)1) ++count;
513 for(s=sh->
ptr,se=s+sh->
n;s!=se;) {
515 if(s->
bi==-(
uint)1) { ++s;
continue; }
516 *p++ =
i, *p++ = s->
bi;
517 for(++s;s!=se&&s->
i==
i;++s)
if(s->
bi!=-(
uint)1) *p++ = s->
bi;
528 *mem_size =
sizeof(
struct pw_data);
585 const struct cr_data *crd = execdata;
598 char *sendbuf, *buf_old, *buf_new;
603 for(k=0;k<nstages;++k) {
607 stage[k].
p1, comm->
np+k);
608 if(stage[k].nrecvn==2)
609 comm_irecv(&req[2],comm,buf_new+unit_size*stage[k].size_r1,
610 unit_size*stage[k].
size_r2, stage[k].
p2, comm->
np+k);
611 sendbuf = buf_new+unit_size*stage[k].
size_r;
613 scatter_user_to_buf[mode](sendbuf,data,vn,stage[0].
scatter_map,
dom);
615 scatter_buf_to_buf[mode](sendbuf,buf_old,vn,stage[k].
scatter_map,
dom),
616 gather_buf_to_buf [mode](sendbuf,buf_old,vn,stage[k].
gather_map ,dom,op);
619 stage[k].p1, comm->
np+k);
621 {
char *t = buf_old; buf_old=buf_new; buf_new=t; }
623 scatter_buf_to_user[mode](data,buf_old,vn,stage[k].
scatter_map,
dom);
624 gather_buf_to_user [mode](data,buf_old,vn,stage[k].
gather_map ,
dom,op);
637 uint nl = (
n+1)/2, bh = bl+nl;
638 if(
id<bh)
n=nl;
else n-=nl,bl=bh;
644 mem_size += 2*(k+1)*
sizeof(
struct cr_stage);
645 bl=0,
n=comm->
np, k=0;
647 uint nl = (
n+1)/2, bh = bl+nl;
648 uint targ;
unsigned recvn;
649 recvn = 1, targ =
n-1-(
id-bl)+bl;
650 if(
id==targ) targ=bh, recvn=0;
651 if(
n&1 &&
id==bh) recvn=2;
655 if(
id<bh)
n=nl;
else n-=nl,bl=bh;
667 const unsigned send_mask,
uint this_p)
670 uint last_i=-(
uint)1;
int added_myself;
671 uint cw_n = 0, cw_max = cw->
max;
675 #define CW_ADD(aid,ap,ari,asi) do { \
677 array_reserve(struct crl_id,cw,cw_n+1),cw_max=cw->max, \
678 w=(struct crl_id*)cw->ptr+cw_n; \
679 w->id=aid, w->p=ap, w->ri=ari, w->si=asi; \
683 for(s=sh->
ptr,se=s+sh->
n;s!=se;++s) {
684 int send = (s->
flags&send_mask)==0;
685 int recv = (s->
flags&recv_mask)==0;
686 if(s->
i!=last_i) last_i=s->
i, added_myself=0;
700 struct crl_id *w, *we, *other;
701 uint scount=1, gcount=1, *sp, *gp;
703 for(w=cw->
ptr,we=w+cw->
n;w!=we;w=other) {
706 for(other=w+1;other!=we&&other->
bi==
bi;++other)
707 if(other->
si!=
si)
si=other->
si, any=2, ++gcount;
712 mem_size += (scount+gcount)*
sizeof(
uint);
713 for(w=cw->
ptr,we=w+cw->
n;w!=we;w=other) {
715 *sp++ = w->
si, *sp++ =
bi;
717 for(other=w+1;other!=we&&other->
bi==
bi;++other)
718 if(other->
si!=
si)
si=other->
si, any=1, *gp++ =
si;
719 if(any) *gp++ = -(
uint)1;
else --gp;
730 struct crl_id *w, *we, *start;
731 uint nsend, nkeep = 0, nks = 0,
bi=0;
733 if(send_hi)
for(w=cw->
ptr,we=w+cw->
n;w!=we;++w) w->
send = w->
p< cutoff;
734 else for(w=cw->
ptr,we=w+cw->
n;w!=we;++w) w->
send = w->
p>=cutoff;
736 for(start=cw->
ptr,w=start,we=w+cw->
n;w!=we;++w) {
738 if(w->
id!=start->
id) start=w;
744 for(start=cw->
ptr,w=start,we=w+nsend+nks;w!=we;++w) {
745 if(w->
id!=start->
id) start=w, ++
bi;
749 stage->
size_s = nsend+nks==0 ? 0 :
bi+1;
750 for(we=(
struct crl_id*)cw->
ptr+cw->
n;w!=we;++w) {
751 if(w->
id!=start->
id) start=w, ++
bi;
756 *mem_size +=
crl_maps(stage,cw,buf);
761 for(;
n;--
n) w->
si=w->
bi+v, ++w;
765 for(;
n;--
n) w->
bi=w->
ri, ++w;
777 uint nl = (
n+1)/2, bh = bl+nl;
778 uint nkeep, nsend[2], nrecv[2][2] = {{0,0},{0,0}};
779 struct crl_id *wrecv[2], *wsend;
782 nkeep = cw->
n - nsend[0];
797 wrecv[0] = cw->
ptr, wrecv[0] += cw->
n, wrecv[1] = wrecv[0]+nrecv[0][0];
798 wsend = cw->
ptr, wsend += nkeep;
812 memmove(wsend,wrecv[0],(nrecv[0][0]+nrecv[1][0])*
sizeof(
struct crl_id));
813 cw->
n += nrecv[0][0] + nrecv[1][0];
816 if(
id<bh)
n=nl;
else n-=nl,bl=bh;
820 *mem_size +=
crl_maps(stage,cw,buf);
830 *mem_size =
sizeof(
struct cr_data);
854 for(k=0; k<kmax; ++k) {
865 free(data->
stage[0]);
897 unsigned unit_size = gs_dom_size[
dom];
899 ardbuf = buf+unit_size*gvn;
913 struct array *pr,
const unsigned flags_mask,
int to_buf,
uint *mem_size)
916 uint count=1, *map, *m;
917 for(p=pr->
ptr,pe=p+pr->
n;p!=pe;++p)
918 if((p->
flag&flags_mask)==0) count+=3;
921 for(p=pr->
ptr,pe=p+pr->
n;p!=pe;++p)
922 if((p->
flag&flags_mask)==0)
923 *m++ = p->
i, *m++ = p->
ord, *m++ = -(
uint)1;
925 for(p=pr->
ptr,pe=p+pr->
n;p!=pe;++p)
926 if((p->
flag&flags_mask)==0)
927 *m++ = p->
ord, *m++ = p->
i, *m++ = -(
uint)1;
987 times[0] = t/comm->
np, times[1] = t, times[2] = t;
999 const char *
name =
"pairwise";
1003 #define DRY_RUN(i,gsr,str) do { \
1004 if(comm->id==0) printf(" " str ": "); \
1005 dry_run_time(time[i],gsr,comm,buf); \
1007 printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \
1010 #define DRY_RUN_CHECK(str,new_name) do { \
1011 DRY_RUN(1,&r_alt,str); \
1012 if(time[1][2]<time[0][2]) \
1013 time[0][2]=time[1][2], name=new_name, \
1014 r->fin(r->data), *r = r_alt; \
1016 r_alt.fin(r_alt.data); \
1019 DRY_RUN(0, r,
"pairwise times (avg, min, max)");
1029 #undef DRY_RUN_CHECK
1032 if(comm->
id==0) printf(
" used all_to_all method: %s\n",name);
1098 int unique,
gs_method method,
int verbose)
1100 static setup_fun *
const remote_setup[] =
1114 if(verbose && gsh->
comm.
id==0)
1115 printf(
"gs_setup: %ld unique labels shared\n",(
long)top.
total_shared);
1117 remote_setup[method](&gsh->
r, &top,&gsh->
comm,&cr.
data);
1121 double avg[2],td[2];
uint min[2],max[2],ti[2];
1123 avg[1] = min[1] = max[1] =
sizeof(double)*gsh->
r.
buffer_size;
1129 printf(
" " "handle bytes (avg, min, max)" ": " "%g %u %u\n",
1130 avg[0], (
unsigned)min[0], (
unsigned)max[0]);
1131 printf(
" " "buffer bytes (avg, min, max)" ": " "%g %u %u\n",
1132 avg[1], (
unsigned)min[1], (
unsigned)max[1]);
1141 int unique,
gs_method method,
int verbose)
/* cgs* aliases: each one expands to PREFIXED_NAME applied to the matching
   gs* identifier (cgs -> gs, cgs_vec -> gs_vec, and so on).
   NOTE(review): presumably these let the wrappers below call the C entry
   points under their prefixed names -- confirm how PREFIXED_NAME treats its
   argument. */
1180 #define cgs PREFIXED_NAME(gs )
1181 #define cgs_vec PREFIXED_NAME(gs_vec )
1182 #define cgs_many PREFIXED_NAME(gs_many )
1183 #define cgs_setup PREFIXED_NAME(gs_setup)
1184 #define cgs_free PREFIXED_NAME(gs_free )
/* fgs* names: each one expands to FORTRAN_NAME given a lowercase and an
   uppercase spelling of the external name. FORTRAN_NAME is defined outside
   this excerpt.
   NOTE(review): presumably it selects the Fortran linker-name convention --
   confirm.
   The operation entry points use gs_op* spellings (fgs -> gs_op,
   fgs_vec -> gs_op_vec, fgs_many -> gs_op_many, fgs_fields -> gs_op_fields),
   not gs*; setup and free keep the gs_ spelling. */
1186 #define fgs_setup_pick FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK)
1187 #define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP )
1188 #define fgs FORTRAN_NAME(gs_op ,GS_OP )
1189 #define fgs_vec FORTRAN_NAME(gs_op_vec ,GS_OP_VEC )
1190 #define fgs_many FORTRAN_NAME(gs_op_many ,GS_OP_MANY )
1191 #define fgs_fields FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS )
1192 #define fgs_free FORTRAN_NAME(gs_free ,GS_FREE )
1220 fail(1,__FILE__,line,
"%s: invalid handle", func);
1226 const char *func,
unsigned line)
1229 fail(1,__FILE__,line,
"%s: datatype %d not in valid range 1-3",func,dom);
1231 fail(1,__FILE__,line,
"%s: op %d not in valid range 1-4",func,op);
1239 cgs(u,
fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0);
1247 fgs_info[*handle],0);
1251 void *u4,
void *u5,
void *u6,
const sint *
n,
1255 uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6;
1257 cgs_many((
void *
const*)uu,*n,
fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,
1258 fgs_info[*handle],0);
1264 void *u,
const sint *stride,
const sint *
n,
1275 p = fgs_fields_array.
ptr;
1276 offset = *stride * gs_dom_size[*dom-1];
1277 for(i=*n;
i;--
i) *p++ = u, u = (
char*)u + offset;
1280 fgs_dom[*dom],(gs_op_t)(*op-1),
1281 *transpose!=0, fgs_info[*handle],0);
1288 fgs_info[*handle] = 0;
static void pw_comm_free(struct pw_comm_data *data)
static void allreduce_exec(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
static void comm_barrier(const struct comm *c)
const uint * map_local[2]
static void crl_work_init(struct array *cw, struct array *sh, const unsigned send_mask, uint this_p)
#define tmalloc(type, count)
struct cr_stage * stage[2]
static void cr_exec(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
static char * pw_exec_recvs(char *buf, const unsigned unit_size, const struct comm *comm, const struct pw_comm_data *c, comm_req *req)
subroutine transpose(a, lda, b, ldb)
#define DRY_RUN_CHECK(str, new_name)
#define array_reserve(T, a, min)
static char * pw_exec_sends(char *buf, const unsigned unit_size, const struct comm *comm, const struct pw_comm_data *c, comm_req *req)
#define sarray_sort_2(T, A, n, field1, is_long1, field2, is_long2, buf)
void exec_fun(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
static void pw_exec(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
static void cr_free(struct cr_data *data)
static void comm_wait(comm_req *req, int n)
static void pw_free(struct pw_data *data)
#define CW_ADD(aid, ap, ari, asi)
static void init_noop(void *out, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
void gs_gather_fun(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
static uint cr_schedule(struct cr_data *data, const struct comm *comm)
void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, void *v, uint vn, void *buf)
static void pw_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
const uint * map_to_buf[2]
static void nonzero_ids(struct array *nz, const slong *id, const uint n, buffer *buf)
#define trealloc(type, ptr, count)
#define DRY_RUN(i, gsr, str)
static ulong shared_ids(struct array *sh, struct array *pr, const struct array *nz, struct crystal *cr)
static void gs_topology_free(struct gs_topology *top)
static double comm_time(void)
static buffer static_buffer
struct pw_comm_data comm[2]
static void comm_free(struct comm *c)
void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op, const void *v, uint vn, void *buffer)
static void crl_bi_to_si(struct crl_id *w, uint n, uint v)
static const uint * allreduce_map_setup(struct array *pr, const unsigned flags_mask, int to_buf, uint *mem_size)
static struct pw_data * pw_setup_aux(struct array *sh, buffer *buf, uint *mem_size)
static struct allreduce_data * allreduce_setup_aux(struct array *pr, ulong total_shared, uint *mem_size)
static const gs_dom fgs_dom[4]
static void unique_ids(struct array *un, const struct array *nz, const uint np)
#define buffer_reserve(b, max)
static void auto_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
static void fgs_check_handle(sint handle, const char *func, unsigned line)
#define comm_init_check(c, ce, np)
static void dry_run_time(double times[3], const struct gs_remote *r, const struct comm *comm, buffer *buf)
static uint crl_work_label(struct array *cw, struct cr_stage *stage, uint cutoff, int send_hi, buffer *buf, uint *mem_size)
static void allreduce_free(struct allreduce_data *ard)
static uint pw_comm_setup(struct pw_comm_data *data, struct array *sh, const unsigned flags_mask, buffer *buf)
static void allreduce_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
static void gather_noop(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
const uint * map_from_buf[2]
#define array_init(T, a, max)
static struct gs_data ** fgs_info
#define array_resize(T, a, max)
static void get_topology(struct gs_topology *top, const slong *id, uint n, struct crystal *cr)
#define sarray_sort(T, A, n, field, is_long, buf)
#define gs_scatter_vec_to_many
static void comm_irecv(comm_req *req, const struct comm *c, void *p, size_t n, uint src, int tag)
static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n, int unique, gs_method method, int verbose)
static uint cr_learn(struct array *cw, struct cr_stage *stage, const struct comm *comm, buffer *buf, uint *mem_size)
static uint local_setup(struct gs_data *gsh, const struct array *nz)
void gs_scatter_fun(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom)
static void scatter_noop(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom)
static void shared_ids_aux(struct array *sh, struct array *pr, uint pr_n, struct array *wa, buffer *buf)
static void crl_ri_to_bi(struct crl_id *w, uint n)
static uint crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf)
void gs_init_fun(void *out, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
const uint * flagged_primaries
#define GS_DEFINE_DOM_SIZES()
#define sarray_transfer(T, A, proc_field, set_src, cr)
static struct array fgs_fields_array
static struct cr_data * cr_setup_aux(struct array *sh, const struct comm *comm, buffer *buf, uint *mem_size)
Defines macros that establish the naming conventions
static const uint * pw_map_setup(struct array *sh, buffer *buf, uint *mem_size)
static void cr_free_stage_maps(struct cr_stage *stage, unsigned kmax)
static void fgs_check_parms(sint handle, sint dom, sint op, const char *func, unsigned line)
#define gs_gather_vec_to_many
static void make_topology_unique(struct gs_topology *top, slong *id, uint pid, buffer *buf)
static void cr_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
void setup_fun(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
static void comm_isend(comm_req *req, const struct comm *c, void *p, size_t n, uint dst, int tag)
static char name[MAX_NAME+1]
static void gs_aux(void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf)
static const uint * flagged_primaries_map(const struct array *nz, uint *mem_size)
static const uint * local_map(const struct array *nz, const int ignore_flagged, uint *mem_size)
void fail(int status, const char *file, unsigned line, const char *fmt,...)
#define gs_scatter_many_to_vec