Nek5000
SEM for Incompressible NS
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
gs.c
Go to the documentation of this file.
1 
5 #include <stdio.h>
6 
7 #include <stddef.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include "c99.h"
11 #include "name.h"
12 #include "fail.h"
13 #include "types.h"
14 
15 #define gs_op gs_op_t /* fix conflict with fortran */
16 
17 #include "gs_defs.h"
18 #include "gs_local.h"
19 #include "comm.h"
20 #include "mem.h"
21 #include "sort.h"
22 #include "crystal.h"
23 #include "sarray_sort.h"
24 #include "sarray_transfer.h"
25 
26 #define gs PREFIXED_NAME(gs )
27 #define gs_vec PREFIXED_NAME(gs_vec )
28 #define gs_many PREFIXED_NAME(gs_many )
29 #define gs_setup PREFIXED_NAME(gs_setup )
30 #define gs_free PREFIXED_NAME(gs_free )
31 #define gs_unique PREFIXED_NAME(gs_unique)
32 
34 
35 typedef enum { mode_plain, mode_vec, mode_many,
37 
39 
40 static void gather_noop(
41  void *out, const void *in, const unsigned vn,
42  const uint *map, gs_dom dom, gs_op op)
43 {}
44 
45 static void scatter_noop(
46  void *out, const void *in, const unsigned vn,
47  const uint *map, gs_dom dom)
48 {}
49 
50 static void init_noop(
51  void *out, const unsigned vn,
52  const uint *map, gs_dom dom, gs_op op)
53 {}
54 
55 /*------------------------------------------------------------------------------
56  Topology Discovery
57 ------------------------------------------------------------------------------*/
58 
59 struct gs_topology {
60  ulong total_shared; /* number of globally unique shared ids */
61  struct array nz; /* array of nonzero_id's, grouped by id,
62  sorted by primary index, then flag, then index */
63  struct array sh; /* array of shared_id's, arbitrary ordering */
64  struct array pr; /* array of primary_shared_id's */
65 };
66 
67 static void gs_topology_free(struct gs_topology *top)
68 {
69  array_free(&top->pr);
70  array_free(&top->sh);
71  array_free(&top->nz);
72 }
73 
74 /************** Local topology **************/
75 
76 /* nonzero_ids (local part)
77 
78  Creates an array of s_nonzeros, one per nonzero in user id array. The
79  output array is grouped by id. Within each group, non-flagged entries come
80  first; otherwise the entries within the group are sorted by the index into
81  the user id array. The first index in each group is the primary index, and
82  is stored along with each entry. The groups themselves are ordered in
83  increasing order of the primary index associated with the group (as opposed
84  to the user id). */
85 
86 struct nonzero_id {
88 };
89 
90 static void nonzero_ids(struct array *nz,
91  const slong *id, const uint n, buffer *buf)
92 {
93  ulong last_id = -(ulong)1;
94  uint i, primary = -(uint)1;
95  struct nonzero_id *row, *end;
96  array_init(struct nonzero_id,nz,n), end=row=nz->ptr;
97  for(i=0;i<n;++i) {
98  slong id_i = id[i], abs_id = iabsl(id_i);
99  if(id_i==0) continue;
100  end->i = i;
101  end->id = abs_id;
102  end->flag = id_i!=abs_id;
103  ++end;
104  }
105  nz->n = end-row;
106  array_resize(struct nonzero_id,nz,nz->n);
107  sarray_sort_2(struct nonzero_id,nz->ptr,nz->n, id,1, flag,0, buf);
108  for(row=nz->ptr,end=row+nz->n;row!=end;++row) {
109  ulong this_id = row->id;
110  if(this_id!=last_id) primary = row->i;
111  row->primary = primary;
112  last_id = this_id;
113  }
114  sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf);
115 }
116 
117 /************** Global topology **************/
118 
119 /* construct list of all unique id's on this proc */
121 static void unique_ids(struct array *un, const struct array *nz, const uint np)
122 {
123  struct unique_id *un_row;
124  const struct nonzero_id *nz_row, *nz_end;
125  array_init(struct unique_id,un,nz->n), un_row=un->ptr;
126  for(nz_row=nz->ptr,nz_end=nz_row+nz->n;nz_row!=nz_end;++nz_row) {
127  if(nz_row->i != nz_row->primary) continue;
128  un_row->id = nz_row->id;
129  un_row->work_proc = nz_row->id%np;
130  un_row->src_if = nz_row->flag ? ~nz_row->i : nz_row->i;
131  ++un_row;
132  }
133  un->n = un_row - (struct unique_id*)un->ptr;
134 }
135 
136 /* shared_ids (global part)
137 
138  Creates an array of shared_id's from an array of nonzero_id's. Each entry
139  in the output identifies one id shared with one other processor p.
140  Note: two procs share an id only when at least one of them has it unflagged.
141  The primary index is i locally and ri remotely. Bit 1 of flags indicates
142  the local flag, bit 2 indicates the remote flag. The output has no
143  particular ordering.
144 
145  Also creates an array of primary_shared_id's, one for each shared id.
146  This struct includes ord, a global rank of the id (arbitrary, but unique). */
147 
148 #define FLAGS_LOCAL 1
149 #define FLAGS_REMOTE 2
150 
151 /* i : local primary index
152  p : remote proc
153  ri : remote primary index
154  bi : buffer index (set and used during pw setup) */
155 struct shared_id {
156  ulong id; uint i, p, ri, bi; unsigned flags;
157 };
158 
160  ulong id, ord; uint i; unsigned flag;
161 };
162 
163 
164 
166 static void shared_ids_aux(struct array *sh, struct array *pr, uint pr_n,
167  struct array *wa, buffer *buf)
168 {
169  const struct shared_id_work *w, *we;
170  struct shared_id *s;
171  struct primary_shared_id *p;
172  ulong last_id = -(ulong)1;
173  /* translate work array to output arrays */
174  sarray_sort(struct shared_id_work,wa->ptr,wa->n, id,1, buf);
175  array_init(struct shared_id,sh,wa->n), sh->n=wa->n, s=sh->ptr;
176  array_init(struct primary_shared_id,pr,pr_n), p=pr->ptr;
177  for(w=wa->ptr,we=w+wa->n;w!=we;++w) {
178  uint i1f = w->i1f, i2f = w->i2f;
179  uint i1 = ~i1f<i1f?~i1f:i1f, i2 = ~i2f<i2f?~i2f:i2f;
180  s->id=w->id, s->i=i1, s->p=w->p2, s->ri=i2;
181  s->flags = ((i2f^i2)&FLAGS_REMOTE) | ((i1f^i1)&FLAGS_LOCAL);
182  ++s;
183  if(w->id!=last_id) {
184  last_id=w->id;
185  p->id=last_id, p->ord=w->ord, p->i=i1, p->flag=(i1f^i1)&FLAGS_LOCAL;
186  ++p;
187  }
188  }
189  pr->n = p-(struct primary_shared_id*)pr->ptr;
190  sarray_sort(struct primary_shared_id,pr->ptr,pr->n, i,0, buf);
191 }
192 
193 static ulong shared_ids(struct array *sh, struct array *pr,
194  const struct array *nz, struct crystal *cr)
195 {
196  struct array un; struct unique_id *un_row, *un_end, *other;
197  ulong last_id = -(ulong)1;
198  ulong ordinal[2], n_shared=0, scan_buf[2];
199  struct array wa; struct shared_id_work *w;
200  uint n_unique;
201  /* construct list of all unique id's on this proc */
202  unique_ids(&un,nz,cr->comm.np);
203  n_unique = un.n;
204  /* transfer list to work procs */
205  sarray_transfer(struct unique_id,&un, work_proc,1, cr);
206  /* group by id, put flagged entries after unflagged (within each group) */
207  sarray_sort_2(struct unique_id,un.ptr,un.n, id,1, src_if,0, &cr->data);
208  /* count shared id's */
209  for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) {
210  ulong id = un_row->id;
211  if(~un_row->src_if<un_row->src_if) continue;
212  if(id==last_id) continue;
213  other=un_row+1;
214  if(other!=un_end&&other->id==id) last_id=id, ++n_shared;
215  }
216  comm_scan(ordinal, &cr->comm,gs_slong,gs_add, &n_shared,1, scan_buf);
217  /* there are ordinal[1] globally shared unique ids;
218  and ordinal[0] of those are seen by work procs of lower rank;
219  i.e., this work processor sees the range ordinal[0] + (0,n_shared-1) */
220  /* construct list of shared ids */
221  last_id = -(ulong)1;
222  array_init(struct shared_id_work,&wa,un.n), wa.n=0, w=wa.ptr;
223  for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) {
224  ulong id = un_row->id;
225  uint p1 = un_row->work_proc, i1f = un_row->src_if;
226  if(~i1f<i1f) continue;
227  for(other=un_row+1;other!=un_end&&other->id==id;++other) {
228  uint p2 = other->work_proc, i2f = other->src_if;
229  ulong ord;
230  if(id!=last_id) last_id=id, ++ordinal[0];
231  ord=ordinal[0]-1;
232  if(wa.n+2>wa.max)
233  array_reserve(struct shared_id_work,&wa,wa.n+2),
234  w=(struct shared_id_work*)wa.ptr+wa.n;
235  w->id=id, w->ord=ord, w->p1=p1, w->p2=p2, w->i1f=i1f, w->i2f=i2f, ++w;
236  w->id=id, w->ord=ord, w->p1=p2, w->p2=p1, w->i1f=i2f, w->i2f=i1f, ++w;
237  wa.n+=2;
238  }
239  }
240  /* transfer shared list to source procs */
241  sarray_transfer(struct shared_id_work,&wa, p1,0, cr);
242  /* fill output arrays from work array */
243  shared_ids_aux(sh,pr,n_unique,&wa,&cr->data);
244  array_free(&un);
245  array_free(&wa);
246  return ordinal[1];
247 }
248 
249 static void get_topology(struct gs_topology *top,
250  const slong *id, uint n, struct crystal *cr)
251 {
252  nonzero_ids(&top->nz,id,n,&cr->data);
253  top->total_shared = shared_ids(&top->sh,&top->pr, &top->nz,cr);
254 }
255 
256 static void make_topology_unique(struct gs_topology *top, slong *id,
257  uint pid, buffer *buf)
258 {
259  struct array *const nz=&top->nz, *const sh=&top->sh, *const pr=&top->pr;
260  struct nonzero_id *pnz;
261  struct shared_id *pb, *pe, *e, *out;
262  struct primary_shared_id *q;
263 
264  /* flag local non-primaries */
265  sarray_sort(struct nonzero_id,nz->ptr,nz->n, i,0, buf);
266  if(id) {
267  struct nonzero_id *p,*e;
268  for(p=nz->ptr,e=p+nz->n;p!=e;++p)
269  if(p->i != p->primary) id[p->i]=-(slong)p->id,p->flag=1;
270  } else {
271  struct nonzero_id *p,*e;
272  for(p=nz->ptr,e=p+nz->n;p!=e;++p)
273  if(p->i != p->primary) p->flag=1;
274  }
275  sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf);
276 
277  /* assign owner among shared primaries */
278 
279  /* create sentinel with i = -1 */
280  array_reserve(struct shared_id,sh,sh->n+1);
281  ((struct shared_id*)sh->ptr)[sh->n].i = -(uint)1;
282  /* in the sorted list of procs sharing a given id,
283  the owner is chosen to be the j^th unflagged proc,
284  where j = id mod (length of list) */
285  sarray_sort_2(struct shared_id,sh->ptr,sh->n, i,0, p,0, buf);
286  out=sh->ptr; pnz=top->nz.ptr;
287  for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
288  uint i = pb->i, lt=0,gt=0, owner; struct shared_id *p;
289  while(pnz->i!=i) ++pnz;
290  /* note: current proc not in list */
291  for(pe=pb; pe->i==i && pe->p<pid; ++pe) if(!(pe->flags&FLAGS_REMOTE)) ++lt;
292  for( ; pe->i==i ; ++pe) if(!(pe->flags&FLAGS_REMOTE)) ++gt;
293  if(!(pb->flags&FLAGS_LOCAL)) {
294  owner = pb->id%(lt+gt+1);
295  if(owner==lt) goto make_sh_unique_mine;
296  if(owner>lt) --owner;
297  } else
298  owner = pb->id%(lt+gt);
299  /* we don't own pb->id */
300  if(id) id[i] = -(slong)pb->id;
301  pnz->flag=1;
302  /* we only share this id with the owner now; remove the other entries */
303  for(p=pb; p!=pe; ++p) if(!(p->flags&FLAGS_REMOTE) && !(owner--)) break;
304  if(p!=pe) *out=*p, out->flags=FLAGS_LOCAL, ++out;
305  continue;
306  make_sh_unique_mine:
307  /* we own pb->id */
308  if(out==pb) { out=pe; for(p=pb; p!=pe; ++p) p->flags=FLAGS_REMOTE; }
309  else for(p=pb; p!=pe; ++p) *out=*p,out->flags=FLAGS_REMOTE,++out;
310  }
311  sh->n = out - ((struct shared_id*)sh->ptr);
312 
313  /* set primary_shared_id flags to match */
314  ((struct shared_id*)sh->ptr)[sh->n].i = -(uint)1;
315  sarray_sort(struct shared_id,sh->ptr,sh->n, id,1, buf);
316  sarray_sort(struct primary_shared_id,pr->ptr,pr->n, id,1, buf);
317  q=pr->ptr;
318  for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
319  uint i=pb->i;
320  pe=pb; while(pe->i==i) ++pe;
321  if(q->id!=pb->id) printf("FAIL!!!\n");
322  q->flag=pb->flags&FLAGS_LOCAL;
323  ++q;
324  }
325 }
326 
327 /*------------------------------------------------------------------------------
328  Local setup
329 ------------------------------------------------------------------------------*/
330 
331 /* assumes nz is sorted by primary, then flag, then index */
332 static const uint *local_map(const struct array *nz, const int ignore_flagged,
333  uint *mem_size)
334 {
335  uint *map, *p, count = 1;
336  const struct nonzero_id *row, *other, *end;
337 #define DO_COUNT(cond) do \
338  for(row=nz->ptr,end=row+nz->n;row!=end;) { \
339  ulong row_id = row->id; int any=0; \
340  for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
341  any=2, ++count; \
342  count+=any, row=other; \
343  } while(0)
344  if(ignore_flagged) DO_COUNT(other->flag==0); else DO_COUNT(1);
345 #undef DO_COUNT
346  p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint);
347 #define DO_SET(cond) do \
348  for(row=nz->ptr,end=row+nz->n;row!=end;) { \
349  ulong row_id = row->id; int any=0; \
350  *p++ = row->i; \
351  for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
352  any=1, *p++ = other->i; \
353  if(any) *p++ = -(uint)1; else --p; \
354  row=other; \
355  } while(0)
356  if(ignore_flagged) DO_SET(other->flag==0); else DO_SET(1);
357 #undef DO_SET
358  *p = -(uint)1;
359  return map;
360 }
361 
362 static const uint *flagged_primaries_map(const struct array *nz, uint *mem_size)
363 {
364  uint *map, *p, count=1;
365  const struct nonzero_id *row, *end;
366  for(row=nz->ptr,end=row+nz->n;row!=end;++row)
367  if(row->i==row->primary && row->flag==1) ++count;
368  p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint);
369  for(row=nz->ptr,end=row+nz->n;row!=end;++row)
370  if(row->i==row->primary && row->flag==1) *p++ = row->i;
371  *p = -(uint)1;
372  return map;
373 }
374 
375 /*------------------------------------------------------------------------------
376  Remote execution and setup
377 ------------------------------------------------------------------------------*/
378 
379 typedef void exec_fun(
380  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
381  unsigned transpose, const void *execdata, const struct comm *comm, char *buf);
382 typedef void fin_fun(void *data);
383 
384 struct gs_remote {
386  void *data;
389 };
390 
391 typedef void setup_fun(struct gs_remote *r, struct gs_topology *top,
392  const struct comm *comm, buffer *buf);
393 
394 /*------------------------------------------------------------------------------
395  Pairwise Execution
396 ------------------------------------------------------------------------------*/
397 struct pw_comm_data {
398  uint n; /* number of messages */
399  uint *p; /* message source/dest proc */
400  uint *size; /* size of message */
401  uint total; /* sum of message sizes */
402 };
403 
404 struct pw_data {
405  struct pw_comm_data comm[2];
406  const uint *map[2];
409 };
410 
411 static char *pw_exec_recvs(char *buf, const unsigned unit_size,
412  const struct comm *comm,
413  const struct pw_comm_data *c, comm_req *req)
414 {
415  const uint *p, *pe, *size=c->size;
416  for(p=c->p,pe=p+c->n;p!=pe;++p) {
417  size_t len = *(size++)*unit_size;
418  comm_irecv(req++,comm,buf,len,*p,*p);
419  buf += len;
420  }
421  return buf;
422 }
423 
424 static char *pw_exec_sends(char *buf, const unsigned unit_size,
425  const struct comm *comm,
426  const struct pw_comm_data *c, comm_req *req)
427 {
428  const uint *p, *pe, *size=c->size;
429  for(p=c->p,pe=p+c->n;p!=pe;++p) {
430  size_t len = *(size++)*unit_size;
431  comm_isend(req++,comm,buf,len,*p,comm->id);
432  buf += len;
433  }
434  return buf;
435 }
436 
437 static void pw_exec(
438  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
439  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
440 {
441  const struct pw_data *pwd = execdata;
442  static gs_scatter_fun *const scatter_to_buf[] =
444  static gs_gather_fun *const gather_from_buf[] =
446  const unsigned recv = 0^transpose, send = 1^transpose;
447  unsigned unit_size = vn*gs_dom_size[dom];
448  char *sendbuf;
449  /* post receives */
450  sendbuf = pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req);
451  /* fill send buffer */
452  scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom);
453  /* post sends */
454  pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send],
455  &pwd->req[pwd->comm[recv].n]);
456  comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n);
457  /* gather using recv buffer */
458  gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op);
459 }
460 
461 /*------------------------------------------------------------------------------
462  Pairwise setup
463 ------------------------------------------------------------------------------*/
464 static uint pw_comm_setup(struct pw_comm_data *data, struct array *sh,
465  const unsigned flags_mask, buffer *buf)
466 {
467  uint n=0,count=0, lp=-(uint)1, mem_size=0;
468  struct shared_id *s, *se;
469  /* sort by remote processor and id (a globally consistent ordering) */
470  sarray_sort_2(struct shared_id,sh->ptr,sh->n, p,0, id,1, buf);
471  /* assign index into buffer */
472  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
473  if(s->flags&flags_mask) { s->bi = -(uint)1; continue; }
474  s->bi = count++;
475  if(s->p!=lp) lp=s->p, ++n;
476  }
477  data->n = n;
478  data->p = tmalloc(uint,2*n); mem_size+=2*n*sizeof(uint);
479  data->size = data->p + n;
480  data->total = count;
481  n = 0, lp=-(uint)1;
482  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
483  if(s->flags&flags_mask) continue;
484  if(s->p!=lp) {
485  lp=s->p;
486  if(n!=0) data->size[n-1] = count;
487  count=0, data->p[n++]=lp;
488  }
489  ++count;
490  }
491  if(n!=0) data->size[n-1] = count;
492  return mem_size;
493 }
494 
495 static void pw_comm_free(struct pw_comm_data *data) { free(data->p); }
496 
497 /* assumes that the bi field of sh is set */
498 static const uint *pw_map_setup(struct array *sh, buffer *buf, uint *mem_size)
499 {
500  uint count=0, *map, *p;
501  struct shared_id *s, *se;
502  sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf);
503  /* calculate map size */
504  count=1;
505  for(s=sh->ptr,se=s+sh->n;s!=se;) {
506  uint i=s->i;
507  if(s->bi==-(uint)1) { ++s; continue; }
508  count+=3;
509  for(++s;s!=se&&s->i==i;++s) if(s->bi!=-(uint)1) ++count;
510  }
511  /* write map */
512  p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint);
513  for(s=sh->ptr,se=s+sh->n;s!=se;) {
514  uint i=s->i;
515  if(s->bi==-(uint)1) { ++s; continue; }
516  *p++ = i, *p++ = s->bi;
517  for(++s;s!=se&&s->i==i;++s) if(s->bi!=-(uint)1) *p++ = s->bi;
518  *p++ = -(uint)1;
519  }
520  *p = -(uint)1;
521  return map;
522 }
523 
524 static struct pw_data *pw_setup_aux(struct array *sh, buffer *buf,
525  uint *mem_size)
526 {
527  struct pw_data *pwd = tmalloc(struct pw_data,1);
528  *mem_size = sizeof(struct pw_data);
529 
530  /* default behavior: receive only remotely unflagged data */
531  *mem_size+=pw_comm_setup(&pwd->comm[0],sh, FLAGS_REMOTE, buf);
532  pwd->map[0] = pw_map_setup(sh, buf, mem_size);
533 
534  /* default behavior: send only locally unflagged data */
535  *mem_size+=pw_comm_setup(&pwd->comm[1],sh, FLAGS_LOCAL, buf);
536  pwd->map[1] = pw_map_setup(sh, buf, mem_size);
537 
538  pwd->req = tmalloc(comm_req,pwd->comm[0].n+pwd->comm[1].n);
539  *mem_size += (pwd->comm[0].n+pwd->comm[1].n)*sizeof(comm_req);
540  pwd->buffer_size = pwd->comm[0].total + pwd->comm[1].total;
541  return pwd;
542 }
543 
544 static void pw_free(struct pw_data *data)
545 {
546  pw_comm_free(&data->comm[0]);
547  pw_comm_free(&data->comm[1]);
548  free((uint*)data->map[0]);
549  free((uint*)data->map[1]);
550  free(data->req);
551  free(data);
552 }
553 
554 static void pw_setup(struct gs_remote *r, struct gs_topology *top,
555  const struct comm *comm, buffer *buf)
556 {
557  struct pw_data *pwd = pw_setup_aux(&top->sh,buf, &r->mem_size);
558  r->buffer_size = pwd->buffer_size;
559  r->data = pwd;
560  r->exec = (exec_fun*)&pw_exec;
561  r->fin = (fin_fun*)&pw_free;
562 }
563 
564 /*------------------------------------------------------------------------------
565  Crystal-Router Execution
566 ------------------------------------------------------------------------------*/
567 struct cr_stage {
572  unsigned nrecvn;
573 };
574 
575 struct cr_data {
576  struct cr_stage *stage[2];
577  unsigned nstages;
579 };
580 
581 static void cr_exec(
582  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
583  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
584 {
585  const struct cr_data *crd = execdata;
586  static gs_scatter_fun *const scatter_user_to_buf[] =
588  static gs_scatter_fun *const scatter_buf_to_buf[] =
590  static gs_scatter_fun *const scatter_buf_to_user[] =
592  static gs_gather_fun *const gather_buf_to_user[] =
594  static gs_gather_fun *const gather_buf_to_buf[] =
596  const unsigned unit_size = vn*gs_dom_size[dom], nstages=crd->nstages;
597  unsigned k;
598  char *sendbuf, *buf_old, *buf_new;
599  const struct cr_stage *stage = crd->stage[transpose];
600  buf_old = buf;
601  buf_new = buf_old + unit_size*crd->stage_buffer_size;
602  /* crystal router */
603  for(k=0;k<nstages;++k) {
604  comm_req req[3];
605  if(stage[k].nrecvn)
606  comm_irecv(&req[1],comm,buf_new,unit_size*stage[k].size_r1,
607  stage[k].p1, comm->np+k);
608  if(stage[k].nrecvn==2)
609  comm_irecv(&req[2],comm,buf_new+unit_size*stage[k].size_r1,
610  unit_size*stage[k].size_r2, stage[k].p2, comm->np+k);
611  sendbuf = buf_new+unit_size*stage[k].size_r;
612  if(k==0)
613  scatter_user_to_buf[mode](sendbuf,data,vn,stage[0].scatter_map,dom);
614  else
615  scatter_buf_to_buf[mode](sendbuf,buf_old,vn,stage[k].scatter_map,dom),
616  gather_buf_to_buf [mode](sendbuf,buf_old,vn,stage[k].gather_map ,dom,op);
617 
618  comm_isend(&req[0],comm,sendbuf,unit_size*stage[k].size_s,
619  stage[k].p1, comm->np+k);
620  comm_wait(&req[0],1+stage[k].nrecvn);
621  { char *t = buf_old; buf_old=buf_new; buf_new=t; }
622  }
623  scatter_buf_to_user[mode](data,buf_old,vn,stage[k].scatter_map,dom);
624  gather_buf_to_user [mode](data,buf_old,vn,stage[k].gather_map ,dom,op);
625 }
626 
627 /*------------------------------------------------------------------------------
628  Crystal-Router setup
629 ------------------------------------------------------------------------------*/
630 static uint cr_schedule(struct cr_data *data, const struct comm *comm)
631 {
632  uint mem_size = 0;
633  const uint id = comm->id;
634  uint bl=0, n=comm->np;
635  unsigned k = 0;
636  while(n>1) {
637  uint nl = (n+1)/2, bh = bl+nl;
638  if(id<bh) n=nl; else n-=nl,bl=bh;
639  ++k;
640  }
641  data->nstages = k;
642  data->stage[0] = tmalloc(struct cr_stage,2*(k+1));
643  data->stage[1] = data->stage[0] + (k+1);
644  mem_size += 2*(k+1)*sizeof(struct cr_stage);
645  bl=0, n=comm->np, k=0;
646  while(n>1) {
647  uint nl = (n+1)/2, bh = bl+nl;
648  uint targ; unsigned recvn;
649  recvn = 1, targ = n-1-(id-bl)+bl;
650  if(id==targ) targ=bh, recvn=0;
651  if(n&1 && id==bh) recvn=2;
652  data->stage[1][k].nrecvn=data->stage[0][k].nrecvn=recvn;
653  data->stage[1][k].p1 =data->stage[0][k].p1 =targ;
654  data->stage[1][k].p2 =data->stage[0][k].p2 =comm->id-1;
655  if(id<bh) n=nl; else n-=nl,bl=bh;
656  ++k;
657  }
658  return mem_size;
659 }
660 
661 struct crl_id {
663 };
664 
665 /* assumes sh is grouped by i (e.g., sorted by i or by id) */
666 static void crl_work_init(struct array *cw, struct array *sh,
667  const unsigned send_mask, uint this_p)
668 {
669  const unsigned recv_mask = send_mask^(FLAGS_REMOTE|FLAGS_LOCAL);
670  uint last_i=-(uint)1; int added_myself;
671  uint cw_n = 0, cw_max = cw->max;
672  struct crl_id *w = cw->ptr;
673  struct shared_id *s, *se;
674 
675 #define CW_ADD(aid,ap,ari,asi) do { \
676  if(cw_n==cw_max) \
677  array_reserve(struct crl_id,cw,cw_n+1),cw_max=cw->max, \
678  w=(struct crl_id*)cw->ptr+cw_n; \
679  w->id=aid, w->p=ap, w->ri=ari, w->si=asi; \
680  ++w, ++cw_n; \
681  } while(0)
682 
683  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
684  int send = (s->flags&send_mask)==0;
685  int recv = (s->flags&recv_mask)==0;
686  if(s->i!=last_i) last_i=s->i, added_myself=0;
687  if(!added_myself && recv && (s->flags&FLAGS_LOCAL)==0) {
688  added_myself=1;
689  CW_ADD(s->id,this_p,s->i,s->i);
690  }
691  if(send) CW_ADD(s->id,s->p,s->ri,s->i);
692  }
693  cw->n=cw_n;
694 #undef CW_ADD
695 }
696 
697 static uint crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf)
698 {
699  uint mem_size=0;
700  struct crl_id *w, *we, *other;
701  uint scount=1, gcount=1, *sp, *gp;
702  sarray_sort_2(struct crl_id,cw->ptr,cw->n, bi,0, si,0, buf);
703  for(w=cw->ptr,we=w+cw->n;w!=we;w=other) {
704  uint bi=w->bi,any=0,si=w->si;
705  scount+=3;
706  for(other=w+1;other!=we&&other->bi==bi;++other)
707  if(other->si!=si) si=other->si, any=2, ++gcount;
708  gcount+=any;
709  }
710  stage->scatter_map = sp = tmalloc(uint,scount+gcount);
711  stage->gather_map = gp = sp + scount;
712  mem_size += (scount+gcount)*sizeof(uint);
713  for(w=cw->ptr,we=w+cw->n;w!=we;w=other) {
714  uint bi=w->bi,any=0,si=w->si;
715  *sp++ = w->si, *sp++ = bi;
716  *gp++ = bi;
717  for(other=w+1;other!=we&&other->bi==bi;++other)
718  if(other->si!=si) si=other->si, any=1, *gp++ = si;
719  if(any) *gp++ = -(uint)1; else --gp;
720  *sp++ = -(uint)1;
721  }
722  *sp=-(uint)1, *gp=-(uint)1;
723  return mem_size;
724 }
725 
726 static uint crl_work_label(struct array *cw, struct cr_stage *stage,
727  uint cutoff, int send_hi, buffer *buf,
728  uint *mem_size)
729 {
730  struct crl_id *w, *we, *start;
731  uint nsend, nkeep = 0, nks = 0, bi=0;
732  /* here w->send has a reverse meaning */
733  if(send_hi) for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p< cutoff;
734  else for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p>=cutoff;
735  sarray_sort_2(struct crl_id,cw->ptr,cw->n, id,1, send,0, buf);
736  for(start=cw->ptr,w=start,we=w+cw->n;w!=we;++w) {
737  nkeep += w->send;
738  if(w->id!=start->id) start=w;
739  if(w->send!=start->send) w->send=0,w->bi=1, ++nks; else w->bi=0;
740  }
741  nsend = cw->n-nkeep;
742  /* assign indices; sent ids have priority (hence w->send is reversed) */
743  sarray_sort(struct crl_id,cw->ptr,cw->n, send,0, buf);
744  for(start=cw->ptr,w=start,we=w+nsend+nks;w!=we;++w) {
745  if(w->id!=start->id) start=w, ++bi;
746  if(w->bi!=1) w->send=1; /* switch back to the usual semantics */
747  w->bi = bi;
748  }
749  stage->size_s = nsend+nks==0 ? 0 : bi+1;
750  for(we=(struct crl_id*)cw->ptr+cw->n;w!=we;++w) {
751  if(w->id!=start->id) start=w, ++bi;
752  w->send = 0; /* switch back to the usual semantics */
753  w->bi = bi;
754  }
755  stage->size_sk = cw->n==0 ? 0 : bi+1;
756  *mem_size += crl_maps(stage,cw,buf);
757  return nsend;
758 }
759 
760 static void crl_bi_to_si(struct crl_id *w, uint n, uint v) {
761  for(;n;--n) w->si=w->bi+v, ++w;
762 }
763 
764 static void crl_ri_to_bi(struct crl_id *w, uint n) {
765  for(;n;--n) w->bi=w->ri, ++w;
766 }
767 
768 static uint cr_learn(struct array *cw, struct cr_stage *stage,
769  const struct comm *comm, buffer *buf, uint *mem_size)
770 {
771  comm_req req[3];
772  const uint id = comm->id;
773  uint bl=0, n=comm->np;
774  uint size_max=0;
775  uint tag = comm->np;
776  while(n>1) {
777  uint nl = (n+1)/2, bh = bl+nl;
778  uint nkeep, nsend[2], nrecv[2][2] = {{0,0},{0,0}};
779  struct crl_id *wrecv[2], *wsend;
780  nsend[0] = crl_work_label(cw,stage,bh,id<bh,buf, mem_size);
781  nsend[1] = stage->size_s;
782  nkeep = cw->n - nsend[0];
783 
784  if(stage->nrecvn ) comm_irecv(&req[1],comm,nrecv[0],2*sizeof(uint),
785  stage->p1,tag);
786  if(stage->nrecvn==2) comm_irecv(&req[2],comm,nrecv[1],2*sizeof(uint),
787  stage->p2,tag);
788  comm_isend(&req[0],comm,nsend,2*sizeof(uint),stage->p1,tag);
789  comm_wait(req,1+stage->nrecvn),++tag;
790 
791  stage->size_r1 = nrecv[0][1], stage->size_r2 = nrecv[1][1];
792  stage->size_r = stage->size_r1 + stage->size_r2;
793  stage->size_total = stage->size_r + stage->size_sk;
794  if(stage->size_total>size_max) size_max=stage->size_total;
795 
796  array_reserve(struct crl_id,cw,cw->n+nrecv[0][0]+nrecv[1][0]);
797  wrecv[0] = cw->ptr, wrecv[0] += cw->n, wrecv[1] = wrecv[0]+nrecv[0][0];
798  wsend = cw->ptr, wsend += nkeep;
799  if(stage->nrecvn )
800  comm_irecv(&req[1],comm,wrecv[0],nrecv[0][0]*sizeof(struct crl_id),
801  stage->p1,tag);
802  if(stage->nrecvn==2)
803  comm_irecv(&req[2],comm,wrecv[1],nrecv[1][0]*sizeof(struct crl_id),
804  stage->p2,tag);
805  sarray_sort_2(struct crl_id,cw->ptr,cw->n, send,0, bi,0, buf);
806  comm_isend(&req[0],comm,wsend,nsend[0]*sizeof(struct crl_id),stage->p1,tag);
807  comm_wait(req,1+stage->nrecvn),++tag;
808 
809  crl_bi_to_si(cw->ptr,nkeep,stage->size_r);
810  if(stage->nrecvn) crl_bi_to_si(wrecv[0],nrecv[0][0],0);
811  if(stage->nrecvn==2) crl_bi_to_si(wrecv[1],nrecv[1][0],stage->size_r1);
812  memmove(wsend,wrecv[0],(nrecv[0][0]+nrecv[1][0])*sizeof(struct crl_id));
813  cw->n += nrecv[0][0] + nrecv[1][0];
814  cw->n -= nsend[0];
815 
816  if(id<bh) n=nl; else n-=nl,bl=bh;
817  ++stage;
818  }
819  crl_ri_to_bi(cw->ptr,cw->n);
820  *mem_size += crl_maps(stage,cw,buf);
821  return size_max;
822 }
823 
824 static struct cr_data *cr_setup_aux(
825  struct array *sh, const struct comm *comm, buffer *buf, uint *mem_size)
826 {
827  uint size_max[2];
828  struct array cw = null_array;
829  struct cr_data *crd = tmalloc(struct cr_data,1);
830  *mem_size = sizeof(struct cr_data);
831 
832  /* default behavior: receive only remotely unflagged data */
833  /* default behavior: send only locally unflagged data */
834 
835  *mem_size += cr_schedule(crd,comm);
836 
837  sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf);
838  crl_work_init(&cw,sh, FLAGS_LOCAL , comm->id);
839  size_max[0]=cr_learn(&cw,crd->stage[0],comm,buf, mem_size);
840  crl_work_init(&cw,sh, FLAGS_REMOTE, comm->id);
841  size_max[1]=cr_learn(&cw,crd->stage[1],comm,buf, mem_size);
842 
843  crd->stage_buffer_size = size_max[1]>size_max[0]?size_max[1]:size_max[0];
844 
845  array_free(&cw);
846 
847  crd->buffer_size = 2*crd->stage_buffer_size;
848  return crd;
849 }
850 
851 static void cr_free_stage_maps(struct cr_stage *stage, unsigned kmax)
852 {
853  unsigned k;
854  for(k=0; k<kmax; ++k) {
855  free((uint*)stage->scatter_map);
856  ++stage;
857  }
858  free((uint*)stage->scatter_map);
859 }
860 
861 static void cr_free(struct cr_data *data)
862 {
863  cr_free_stage_maps(data->stage[0],data->nstages);
864  cr_free_stage_maps(data->stage[1],data->nstages);
865  free(data->stage[0]);
866  free(data);
867 }
868 
869 static void cr_setup(struct gs_remote *r, struct gs_topology *top,
870  const struct comm *comm, buffer *buf)
871 {
872  struct cr_data *crd = cr_setup_aux(&top->sh,comm,buf, &r->mem_size);
873  r->buffer_size = crd->buffer_size;
874  r->data = crd;
875  r->exec = (exec_fun*)&cr_exec;
876  r->fin = (fin_fun*)&cr_free;
877 }
878 
879 /*------------------------------------------------------------------------------
880  All-reduce Execution
881 ------------------------------------------------------------------------------*/
883  const uint *map_to_buf[2], *map_from_buf[2];
885 };
886 
887 static void allreduce_exec(
888  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
889  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
890 {
891  const struct allreduce_data *ard = execdata;
892  static gs_scatter_fun *const scatter_to_buf[] =
894  static gs_scatter_fun *const scatter_from_buf[] =
896  uint gvn = vn*(ard->buffer_size/2);
897  unsigned unit_size = gs_dom_size[dom];
898  char *ardbuf;
899  ardbuf = buf+unit_size*gvn;
900  /* user array -> buffer */
901  gs_init_array(buf,gvn,dom,op);
902  scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom);
903  /* all reduce */
904  comm_allreduce(comm,dom,op, buf,gvn, ardbuf);
905  /* buffer -> user array */
906  scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom);
907 }
908 
909 /*------------------------------------------------------------------------------
910  All-reduce setup
911 ------------------------------------------------------------------------------*/
912 static const uint *allreduce_map_setup(
913  struct array *pr, const unsigned flags_mask, int to_buf, uint *mem_size)
914 {
915  struct primary_shared_id *p, *pe;
916  uint count=1, *map, *m;
917  for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
918  if((p->flag&flags_mask)==0) count+=3;
919  m=map=tmalloc(uint,count); *mem_size += count*sizeof(uint);
920  if(to_buf) {
921  for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
922  if((p->flag&flags_mask)==0)
923  *m++ = p->i, *m++ = p->ord, *m++ = -(uint)1;
924  } else {
925  for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
926  if((p->flag&flags_mask)==0)
927  *m++ = p->ord, *m++ = p->i, *m++ = -(uint)1;
928  }
929  *m=-(uint)1;
930  return map;
931 }
932 
934  struct array *pr, ulong total_shared, uint *mem_size)
935 {
936  struct allreduce_data *ard = tmalloc(struct allreduce_data,1);
937  *mem_size = sizeof(struct allreduce_data);
938 
939  /* default behavior: reduce only unflagged data, copy to all */
940  ard->map_to_buf [0] = allreduce_map_setup(pr,1,1, mem_size);
941  ard->map_from_buf[0] = allreduce_map_setup(pr,0,0, mem_size);
942 
943  /* transpose behavior: reduce all data, copy to unflagged */
944  ard->map_to_buf [1] = allreduce_map_setup(pr,0,1, mem_size);
945  ard->map_from_buf[1] = allreduce_map_setup(pr,1,0, mem_size);
946 
947  ard->buffer_size = total_shared*2;
948  return ard;
949 }
950 
951 static void allreduce_free(struct allreduce_data *ard)
952 {
953  free((uint*)ard->map_to_buf[0]);
954  free((uint*)ard->map_to_buf[1]);
955  free((uint*)ard->map_from_buf[0]);
956  free((uint*)ard->map_from_buf[1]);
957  free(ard);
958 }
959 
960 static void allreduce_setup(struct gs_remote *r, struct gs_topology *top,
961  const struct comm *comm, buffer *buf)
962 {
963  struct allreduce_data *ard
964  = allreduce_setup_aux(&top->pr,top->total_shared, &r->mem_size);
965  r->buffer_size = ard->buffer_size;
966  r->data = ard;
967  r->exec = (exec_fun*)&allreduce_exec;
968  r->fin = (fin_fun*)&allreduce_free;
969 }
970 
971 /*------------------------------------------------------------------------------
972  Automatic Setup --- dynamically picks the fastest method
973 ------------------------------------------------------------------------------*/
974 
975 static void dry_run_time(double times[3], const struct gs_remote *r,
976  const struct comm *comm, buffer *buf)
977 {
978  int i; double t;
979  buffer_reserve(buf,gs_dom_size[gs_double]*r->buffer_size);
980  for(i= 2;i;--i)
981  r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr);
982  comm_barrier(comm);
983  t = comm_time();
984  for(i=10;i;--i)
985  r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr);
986  t = (comm_time() - t)/10;
987  times[0] = t/comm->np, times[1] = t, times[2] = t;
988  comm_allreduce(comm,gs_double,gs_add, &times[0],1, &t);
989  comm_allreduce(comm,gs_double,gs_min, &times[1],1, &t);
990  comm_allreduce(comm,gs_double,gs_max, &times[2],1, &t);
991 }
992 
993 static void auto_setup(struct gs_remote *r, struct gs_topology *top,
994  const struct comm *comm, buffer *buf)
995 {
996  pw_setup(r, top,comm,buf);
997 
998  if(comm->np>1) {
999  const char *name = "pairwise";
1000  struct gs_remote r_alt;
1001  double time[2][3];
1002 
1003  #define DRY_RUN(i,gsr,str) do { \
1004  if(comm->id==0) printf(" " str ": "); \
1005  dry_run_time(time[i],gsr,comm,buf); \
1006  if(comm->id==0) \
1007  printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \
1008  } while(0)
1009 
1010  #define DRY_RUN_CHECK(str,new_name) do { \
1011  DRY_RUN(1,&r_alt,str); \
1012  if(time[1][2]<time[0][2]) \
1013  time[0][2]=time[1][2], name=new_name, \
1014  r->fin(r->data), *r = r_alt; \
1015  else \
1016  r_alt.fin(r_alt.data); \
1017  } while(0)
1018 
1019  DRY_RUN(0, r, "pairwise times (avg, min, max)");
1020 
1021  cr_setup(&r_alt, top,comm,buf);
1022  DRY_RUN_CHECK( "crystal router ", "crystal router");
1023 
1024  if(top->total_shared<100000) {
1025  allreduce_setup(&r_alt, top,comm,buf);
1026  DRY_RUN_CHECK( "all reduce ", "allreduce");
1027  }
1028 
1029  #undef DRY_RUN_CHECK
1030  #undef DRY_RUN
1031 
1032  if(comm->id==0) printf(" used all_to_all method: %s\n",name);
1033  }
1034 }
1035 
1036 /*------------------------------------------------------------------------------
1037  Main Execution
1038 ------------------------------------------------------------------------------*/
1039 struct gs_data {
1040  struct comm comm;
1041  const uint *map_local[2]; /* 0=unflagged, 1=all */
1043  struct gs_remote r;
1045 };
1046 
1047 static void gs_aux(
1048  void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose,
1049  struct gs_data *gsh, buffer *buf)
1050 {
1051  static gs_scatter_fun *const local_scatter[] =
1053  static gs_gather_fun *const local_gather [] =
1055  static gs_init_fun *const init[] =
1057  if(!buf) buf = &static_buffer;
1058  buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size);
1059  local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op);
1060  if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op);
1061  gsh->r.exec(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr);
1062  local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom);
1063 }
1064 
1065 void gs(void *u, gs_dom dom, gs_op op, unsigned transpose,
1066  struct gs_data *gsh, buffer *buf)
1067 {
1068  gs_aux(u,mode_plain,1,dom,op,transpose,gsh,buf);
1069 }
1070 
1071 void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op,
1072  unsigned transpose, struct gs_data *gsh, buffer *buf)
1073 {
1074  gs_aux(u,mode_vec,vn,dom,op,transpose,gsh,buf);
1075 }
1076 
1077 void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op,
1078  unsigned transpose, struct gs_data *gsh, buffer *buf)
1079 {
1080  gs_aux((void*)u,mode_many,vn,dom,op,transpose,gsh,buf);
1081 }
1082 
1083 /*------------------------------------------------------------------------------
1084  Main Setup
1085 ------------------------------------------------------------------------------*/
1087 
1088 static uint local_setup(struct gs_data *gsh, const struct array *nz)
1089 {
1090  uint mem_size = 0;
1091  gsh->map_local[0] = local_map(nz,1, &mem_size);
1092  gsh->map_local[1] = local_map(nz,0, &mem_size);
1093  gsh->flagged_primaries = flagged_primaries_map(nz, &mem_size);
1094  return mem_size;
1095 }
1096 
1097 static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n,
1098  int unique, gs_method method, int verbose)
1099 {
1100  static setup_fun *const remote_setup[] =
1102 
1103  struct gs_topology top;
1104  struct crystal cr;
1105 
1106  crystal_init(&cr,&gsh->comm);
1107 
1108  get_topology(&top, id,n, &cr);
1109  if(unique) make_topology_unique(&top,0,gsh->comm.id,&cr.data);
1110 
1111  gsh->handle_size = sizeof(struct gs_data);
1112  gsh->handle_size += local_setup(gsh,&top.nz);
1113 
1114  if(verbose && gsh->comm.id==0)
1115  printf("gs_setup: %ld unique labels shared\n",(long)top.total_shared);
1116 
1117  remote_setup[method](&gsh->r, &top,&gsh->comm,&cr.data);
1118  gsh->handle_size += gsh->r.mem_size;
1119 
1120  if(verbose) { /* report memory usage */
1121  double avg[2],td[2]; uint min[2],max[2],ti[2];
1122  avg[0] = min[0] = max[0] = gsh->handle_size;
1123  avg[1] = min[1] = max[1] = sizeof(double)*gsh->r.buffer_size;
1124  avg[0] /= gsh->comm.np; avg[1] /= gsh->comm.np;
1125  comm_allreduce(&gsh->comm,gs_double,gs_add, avg,2, td);
1126  comm_allreduce(&gsh->comm,gs_sint,gs_min, min,2, ti);
1127  comm_allreduce(&gsh->comm,gs_sint,gs_max, max,2, ti);
1128  if(gsh->comm.id==0) {
1129  printf(" " "handle bytes (avg, min, max)" ": " "%g %u %u\n",
1130  avg[0], (unsigned)min[0], (unsigned)max[0]);
1131  printf(" " "buffer bytes (avg, min, max)" ": " "%g %u %u\n",
1132  avg[1], (unsigned)min[1], (unsigned)max[1]);
1133  }
1134  }
1135 
1136  gs_topology_free(&top);
1137  crystal_free(&cr);
1138 }
1139 
1140 struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm,
1141  int unique, gs_method method, int verbose)
1142 {
1143  struct gs_data *gsh = tmalloc(struct gs_data,1);
1144  comm_dup(&gsh->comm,comm);
1145  gs_setup_aux(gsh,id,n,unique,method,verbose);
1146  return gsh;
1147 }
1148 
1149 void gs_free(struct gs_data *gsh)
1150 {
1151  comm_free(&gsh->comm);
1152  free((uint*)gsh->map_local[0]), free((uint*)gsh->map_local[1]);
1153  free((uint*)gsh->flagged_primaries);
1154  gsh->r.fin(gsh->r.data);
1155  free(gsh);
1156 }
1157 
1158 void gs_unique(slong *id, uint n, const struct comm *comm)
1159 {
1160  struct gs_topology top;
1161  struct crystal cr;
1162  crystal_init(&cr,comm);
1163  get_topology(&top, id,n, &cr);
1164  make_topology_unique(&top,id,comm->id,&cr.data);
1165  gs_topology_free(&top);
1166  crystal_free(&cr);
1167 }
1168 
1169 /*------------------------------------------------------------------------------
1170  FORTRAN interface
1171 ------------------------------------------------------------------------------*/
1172 
1173 #undef gs_op
1174 
1175 #undef gs_free
1176 #undef gs_setup
1177 #undef gs_many
1178 #undef gs_vec
1179 #undef gs
1180 #define cgs PREFIXED_NAME(gs )
1181 #define cgs_vec PREFIXED_NAME(gs_vec )
1182 #define cgs_many PREFIXED_NAME(gs_many )
1183 #define cgs_setup PREFIXED_NAME(gs_setup)
1184 #define cgs_free PREFIXED_NAME(gs_free )
1185 
1186 #define fgs_setup_pick FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK)
1187 #define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP )
1188 #define fgs FORTRAN_NAME(gs_op ,GS_OP )
1189 #define fgs_vec FORTRAN_NAME(gs_op_vec ,GS_OP_VEC )
1190 #define fgs_many FORTRAN_NAME(gs_op_many ,GS_OP_MANY )
1191 #define fgs_fields FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS )
1192 #define fgs_free FORTRAN_NAME(gs_free ,GS_FREE )
1193 
1194 static struct gs_data **fgs_info = 0;
1195 static int fgs_max = 0;
1196 static int fgs_n = 0;
1197 
1198 void fgs_setup_pick(sint *handle, const slong id[], const sint *n,
1199  const MPI_Fint *comm, const sint *np, const sint *method)
1200 {
1201  struct gs_data *gsh;
1202  if(fgs_n==fgs_max) fgs_max+=fgs_max/2+1,
1203  fgs_info=trealloc(struct gs_data*,fgs_info,fgs_max);
1204  gsh=fgs_info[fgs_n]=tmalloc(struct gs_data,1);
1205  comm_init_check(&gsh->comm,*comm,*np);
1206  gs_setup_aux(gsh,id,*n,0,*method,1);
1207  *handle = fgs_n++;
1208 }
1209 
1210 void fgs_setup(sint *handle, const slong id[], const sint *n,
1211  const MPI_Fint *comm, const sint *np)
1212 {
1213  const sint method = gs_auto;
1214  fgs_setup_pick(handle,id,n,comm,np,&method);
1215 }
1216 
1217 static void fgs_check_handle(sint handle, const char *func, unsigned line)
1218 {
1219  if(handle<0 || handle>=fgs_n || !fgs_info[handle])
1220  fail(1,__FILE__,line,"%s: invalid handle", func);
1221 }
1222 
1223 static const gs_dom fgs_dom[4] = { 0, gs_double, gs_sint, gs_slong };
1224 
1226  const char *func, unsigned line)
1227 {
1228  if(dom<1 || dom>3)
1229  fail(1,__FILE__,line,"%s: datatype %d not in valid range 1-3",func,dom);
1230  if(op <1 || op >4)
1231  fail(1,__FILE__,line,"%s: op %d not in valid range 1-4",func,op);
1232  fgs_check_handle(handle,func,line);
1233 }
1234 
1235 void fgs(const sint *handle, void *u, const sint *dom, const sint *op,
1236  const sint *transpose)
1237 {
1238  fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__);
1239  cgs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0);
1240 }
1241 
1242 void fgs_vec(const sint *handle, void *u, const sint *n,
1243  const sint *dom, const sint *op, const sint *transpose)
1244 {
1245  fgs_check_parms(*handle,*dom,*op,"gs_op_vec",__LINE__);
1246  cgs_vec(u,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,
1247  fgs_info[*handle],0);
1248 }
1249 
1250 void fgs_many(const sint *handle, void *u1, void *u2, void *u3,
1251  void *u4, void *u5, void *u6, const sint *n,
1252  const sint *dom, const sint *op, const sint *transpose)
1253 {
1254  void *uu[6];
1255  uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6;
1256  fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__);
1257  cgs_many((void *const*)uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,
1258  fgs_info[*handle],0);
1259 }
1260 
1262 
1263 void fgs_fields(const sint *handle,
1264  void *u, const sint *stride, const sint *n,
1265  const sint *dom, const sint *op, const sint *transpose)
1266 {
1267  size_t offset;
1268  void **p;
1269  uint i;
1270 
1271  fgs_check_parms(*handle,*dom,*op,"gs_op_fields",__LINE__);
1272  if(*n<0) return;
1273 
1274  array_reserve(void*,&fgs_fields_array,*n);
1275  p = fgs_fields_array.ptr;
1276  offset = *stride * gs_dom_size[*dom-1];
1277  for(i=*n;i;--i) *p++ = u, u = (char*)u + offset;
1278 
1279  cgs_many((void *const*)fgs_fields_array.ptr,*n,
1280  fgs_dom[*dom],(gs_op_t)(*op-1),
1281  *transpose!=0, fgs_info[*handle],0);
1282 }
1283 
1284 void fgs_free(const sint *handle)
1285 {
1286  fgs_check_handle(*handle,"gs_free",__LINE__);
1287  cgs_free(fgs_info[*handle]);
1288  fgs_info[*handle] = 0;
1289 }
1290 
static void pw_comm_free(struct pw_comm_data *data)
Definition: gs.c:495
#define slong
Definition: types.h:74
uint primary
Definition: gs.c:87
#define gs_slong
Definition: gs_defs.h:66
static void allreduce_exec(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
Definition: gs.c:887
static void comm_barrier(const struct comm *c)
Definition: comm.h:192
Definition: gs.c:1086
uint buffer_size
Definition: gs.c:884
#define uint
Definition: types.h:70
const uint * map_local[2]
Definition: gs.c:1041
ulong total_shared
Definition: gs.c:60
static void crl_work_init(struct array *cw, struct array *sh, const unsigned send_mask, uint this_p)
Definition: gs.c:666
size_t n
Definition: mem.h:111
#define tmalloc(type, count)
Definition: mem.h:91
fin_fun * fin
Definition: gs.c:388
#define sint
Definition: types.h:69
uint * size
Definition: gs.c:400
#define fgs_setup_pick
Definition: gs.c:1186
unsigned flags
Definition: gs.c:156
#define cgs_free
Definition: gs.c:1184
struct cr_stage * stage[2]
Definition: gs.c:576
uint p1
Definition: gs.c:571
comm_req * req
Definition: gs.c:407
static void cr_exec(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
Definition: gs.c:581
uint stage_buffer_size
Definition: gs.c:578
static int fgs_max
Definition: gs.c:1195
static char * pw_exec_recvs(char *buf, const unsigned unit_size, const struct comm *comm, const struct pw_comm_data *c, comm_req *req)
Definition: gs.c:411
#define gs_init_many
Definition: gs_local.c:18
void fin_fun(void *data)
Definition: gs.c:382
struct array nz
Definition: gs.c:61
#define cgs_many
Definition: gs.c:1182
const uint * gather_map
Definition: gs.c:568
exec_fun * exec
Definition: gs.c:387
#define DRY_RUN_CHECK(str, new_name)
struct gs_remote r
Definition: gs.c:1043
#define array_reserve(T, a, min)
Definition: mem.h:138
static char * pw_exec_sends(char *buf, const unsigned unit_size, const struct comm *comm, const struct pw_comm_data *c, comm_req *req)
Definition: gs.c:424
#define sarray_sort_2(T, A, n, field1, is_long1, field2, is_long2, buf)
Definition: sarray_sort.h:60
void exec_fun(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
Definition: gs.c:379
static void pw_exec(void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
Definition: gs.c:437
static void cr_free(struct cr_data *data)
Definition: gs.c:861
n
Definition: xxt_test.m:73
uint size_r1
Definition: gs.c:569
#define gs_gather_vec
Definition: gs_local.c:13
static void comm_wait(comm_req *req, int n)
Definition: comm.h:236
uint * p
Definition: gs.c:399
uint mem_size
Definition: gs.c:385
static void pw_free(struct pw_data *data)
Definition: gs.c:544
#define CW_ADD(aid, ap, ari, asi)
uint n
Definition: gs.c:398
ulong id
Definition: gs.c:160
ulong id
Definition: gs.c:165
static int fgs_n
Definition: gs.c:1196
#define gs_many
Definition: gs.c:28
static void init_noop(void *out, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
Definition: gs.c:50
void gs_gather_fun(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
Definition: gs_local.h:27
static uint cr_schedule(struct cr_data *data, const struct comm *comm)
Definition: gs.c:630
#define crystal_free
Definition: crystal.c:47
uint handle_size
Definition: gs.c:1044
void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, void *v, uint vn, void *buf)
Definition: comm.c:107
#define gs
Definition: gs.c:26
uint ri
Definition: gs.c:662
Definition: gs.c:155
static void pw_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
Definition: gs.c:554
const uint * map_to_buf[2]
Definition: gs.c:883
Definition: gs.c:86
#define crystal_init
Definition: crystal.c:46
static void nonzero_ids(struct array *nz, const slong *id, const uint n, buffer *buf)
Definition: gs.c:90
#define gs_vec
Definition: gs.c:27
#define trealloc(type, ptr, count)
Definition: mem.h:95
#define DRY_RUN(i, gsr, str)
uint si
Definition: gs.c:662
#define gs_scatter
Definition: gs_local.c:11
#define gs_sint
Definition: gs_defs.h:65
static ulong shared_ids(struct array *sh, struct array *pr, const struct array *nz, struct crystal *cr)
Definition: gs.c:193
#define fgs_vec
Definition: gs.c:1189
struct array sh
Definition: gs.c:63
static void gs_topology_free(struct gs_topology *top)
Definition: gs.c:67
static double comm_time(void)
Definition: comm.h:183
static buffer static_buffer
Definition: gs.c:38
struct pw_comm_data comm[2]
Definition: gs.c:405
#define gs_gather_many
Definition: gs_local.c:16
Definition: comm.h:85
#define fgs_free
Definition: gs.c:1192
#define gs_setup
Definition: gs.c:29
Definition: gs.c:404
uint flag
Definition: gs.c:87
static void comm_free(struct comm *c)
Definition: comm.h:176
void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op, const void *v, uint vn, void *buffer)
Definition: comm.c:101
#define FLAGS_REMOTE
Definition: gs.c:149
#define gs_free
Definition: gs.c:30
static void crl_bi_to_si(struct crl_id *w, uint n, uint v)
Definition: gs.c:760
gs_op
Definition: gs_defs.h:77
static const uint * allreduce_map_setup(struct array *pr, const unsigned flags_mask, int to_buf, uint *mem_size)
Definition: gs.c:912
static struct pw_data * pw_setup_aux(struct array *sh, buffer *buf, uint *mem_size)
Definition: gs.c:524
buffer data
Definition: crystal.c:52
#define gs_init
Definition: gs_local.c:12
static struct allreduce_data * allreduce_setup_aux(struct array *pr, ulong total_shared, uint *mem_size)
Definition: gs.c:933
int comm_req
Definition: comm.h:70
#define cgs
Definition: gs.c:1180
uint total
Definition: gs.c:401
Definition: gs.c:35
gs_dom
Definition: gs_defs.h:61
static const gs_dom fgs_dom[4]
Definition: gs.c:1223
uint size_r2
Definition: gs.c:569
uint size_s
Definition: gs.c:570
ulong id
Definition: gs.c:120
#define fgs_fields
Definition: gs.c:1191
static void unique_ids(struct array *un, const struct array *nz, const uint np)
Definition: gs.c:121
#define fgs_setup
Definition: gs.c:1187
#define buffer_reserve(b, max)
Definition: mem.h:157
ulong id
Definition: gs.c:662
static struct crystal cr
Definition: findpts_test.c:75
#define gs_scatter_many
Definition: gs_local.c:17
Definition: gs.c:120
void * data
Definition: gs.c:386
#define gs_scatter_vec
Definition: gs_local.c:14
static void auto_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
Definition: gs.c:993
p
Definition: xxt_test2.m:1
#define fgs_many
Definition: gs.c:1190
static void fgs_check_handle(sint handle, const char *func, unsigned line)
Definition: gs.c:1217
const gs_dom dom
Definition: gs_test.c:15
uint i2f
Definition: gs.c:165
#define comm_init_check(c, ce, np)
Definition: comm.h:161
uint work_proc
Definition: gs.c:120
static void dry_run_time(double times[3], const struct gs_remote *r, const struct comm *comm, buffer *buf)
Definition: gs.c:975
uint np
Definition: comm.h:86
uint size_r
Definition: gs.c:569
static uint crl_work_label(struct array *cw, struct cr_stage *stage, uint cutoff, int send_hi, buffer *buf, uint *mem_size)
Definition: gs.c:726
static void allreduce_free(struct allreduce_data *ard)
Definition: gs.c:951
#define FLAGS_LOCAL
Definition: gs.c:148
Definition: mem.h:111
static uint pw_comm_setup(struct pw_comm_data *data, struct array *sh, const unsigned flags_mask, buffer *buf)
Definition: gs.c:464
size_t max
Definition: mem.h:111
static void allreduce_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
Definition: gs.c:960
static void gather_noop(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
Definition: gs.c:40
#define iabsl
Definition: types.h:76
#define ulong
Definition: types.h:75
uint p1
Definition: gs.c:165
#define DO_SET(cond)
uint i1f
Definition: gs.c:165
uint buffer_size
Definition: gs.c:385
const uint * map_from_buf[2]
Definition: gs.c:883
uint size_sk
Definition: gs.c:570
#define array_init(T, a, max)
Definition: mem.h:136
uint p
Definition: gs.c:662
static struct gs_data ** fgs_info
Definition: gs.c:1194
uint ri
Definition: gs.c:156
#define gs_gather
Definition: gs_local.c:10
#define array_resize(T, a, max)
Definition: mem.h:137
uint buffer_size
Definition: gs.c:408
Definition: gs.c:575
static void get_topology(struct gs_topology *top, const slong *id, uint n, struct crystal *cr)
Definition: gs.c:249
uint i
Definition: gs.c:87
#define fgs
Definition: gs.c:1188
uint p2
Definition: gs.c:165
Definition: gs.c:384
#define sarray_sort(T, A, n, field, is_long, buf)
Definition: sarray_sort.h:55
ulong id
Definition: gs.c:156
uint send
Definition: gs.c:662
uint p2
Definition: gs.c:571
for i
Definition: xxt_test.m:74
struct array pr
Definition: gs.c:64
ulong id
Definition: gs.c:87
#define gs_scatter_vec_to_many
Definition: gs_local.c:21
uint bi
Definition: gs.c:156
#define null_array
Definition: mem.h:112
static void comm_irecv(comm_req *req, const struct comm *c, void *p, size_t n, uint src, int tag)
Definition: comm.h:220
gs_mode
Definition: gs.c:35
static int flag
Definition: byte.c:52
void * ptr
Definition: mem.h:111
static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n, int unique, gs_method method, int verbose)
Definition: gs.c:1097
const uint * map[2]
Definition: gs.c:406
static uint cr_learn(struct array *cw, struct cr_stage *stage, const struct comm *comm, buffer *buf, uint *mem_size)
Definition: gs.c:768
Definition: gs.c:59
Definition: gs.c:1039
unsigned nrecvn
Definition: gs.c:572
gs_method
Definition: gs.c:1086
static uint local_setup(struct gs_data *gsh, const struct array *nz)
Definition: gs.c:1088
static uint id
Definition: findpts_test.c:63
void gs_scatter_fun(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom)
Definition: gs_local.h:30
static void scatter_noop(void *out, const void *in, const unsigned vn, const uint *map, gs_dom dom)
Definition: gs.c:45
int MPI_Fint
Definition: comm.h:71
const uint * scatter_map
Definition: gs.c:568
static void shared_ids_aux(struct array *sh, struct array *pr, uint pr_n, struct array *wa, buffer *buf)
Definition: gs.c:166
unsigned flag
Definition: gs.c:160
#define array_free(a)
Definition: mem.h:135
uint src_if
Definition: gs.c:120
uint id
Definition: comm.h:86
Definition: gs.c:567
#define null_buffer
Definition: mem.h:154
#define comm_dup(d, s)
Definition: comm.h:174
static void crl_ri_to_bi(struct crl_id *w, uint n)
Definition: gs.c:764
uint p
Definition: gs.c:156
static uint crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf)
Definition: gs.c:697
ulong out[N]
Definition: sort_test2.c:20
#define DO_COUNT(cond)
void gs_init_fun(void *out, const unsigned vn, const uint *map, gs_dom dom, gs_op op)
Definition: gs_local.h:33
const uint * flagged_primaries
Definition: gs.c:1042
unsigned nstages
Definition: gs.c:577
#define GS_DEFINE_DOM_SIZES()
Definition: gs_defs.h:70
Definition: gs.c:661
#define sarray_transfer(T, A, proc_field, set_src, cr)
se
Definition: xxt_test.m:77
static struct array fgs_fields_array
Definition: gs.c:1261
static struct cr_data * cr_setup_aux(struct array *sh, const struct comm *comm, buffer *buf, uint *mem_size)
Definition: gs.c:824
establishes some macros to establish naming conventions
#define gs_init_vec
Definition: gs_local.c:15
static const uint * pw_map_setup(struct array *sh, buffer *buf, uint *mem_size)
Definition: gs.c:498
static void cr_free_stage_maps(struct cr_stage *stage, unsigned kmax)
Definition: gs.c:851
uint size_total
Definition: gs.c:570
static uint np
Definition: findpts_test.c:63
ulong ord
Definition: gs.c:160
Definition: gs.c:35
static void fgs_check_parms(sint handle, sint dom, sint op, const char *func, unsigned line)
Definition: gs.c:1225
struct comm comm
Definition: gs.c:1040
uint buffer_size
Definition: gs.c:578
#define gs_gather_vec_to_many
Definition: gs_local.c:19
static void make_topology_unique(struct gs_topology *top, slong *id, uint pid, buffer *buf)
Definition: gs.c:256
static void cr_setup(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
Definition: gs.c:869
const uint nz[3]
Definition: xxt_test.c:78
void setup_fun(struct gs_remote *r, struct gs_topology *top, const struct comm *comm, buffer *buf)
Definition: gs.c:391
Definition: gs.c:35
static void comm_isend(comm_req *req, const struct comm *c, void *p, size_t n, uint dst, int tag)
Definition: comm.h:228
#define gs_unique
Definition: gs.c:31
static char name[MAX_NAME+1]
Definition: byte.c:53
struct comm comm
Definition: crystal.c:51
#define gs_init_array
Definition: gs_local.c:9
static void gs_aux(void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, struct gs_data *gsh, buffer *buf)
Definition: gs.c:1047
static const uint * flagged_primaries_map(const struct array *nz, uint *mem_size)
Definition: gs.c:362
static const uint * local_map(const struct array *nz, const int ignore_flagged, uint *mem_size)
Definition: gs.c:332
void fail(int status, const char *file, unsigned line, const char *fmt,...)
Definition: fail.c:47
uint i
Definition: gs.c:156
ulong ord
Definition: gs.c:165
#define gs_scatter_many_to_vec
Definition: gs_local.c:20
#define cgs_vec
Definition: gs.c:1181
uint bi
Definition: gs.c:662