Nek5000
SEM for Incompressible NS
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
xxt.c
Go to the documentation of this file.
1 #include <stddef.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <limits.h>
5 #include <float.h>
6 #include <string.h>
7 #include <math.h>
8 #include "c99.h"
9 #include "name.h"
10 #include "fail.h"
11 #include "types.h"
12 #include "tensor.h"
13 #include "gs_defs.h"
14 #include "comm.h"
15 #include "mem.h"
16 #include "sort.h"
17 #include "sarray_sort.h"
18 #include "sparse_cholesky.h"
19 #include "gs.h"
20 
21 #define crs_setup PREFIXED_NAME(crs_setup)
22 #define crs_solve PREFIXED_NAME(crs_solve)
23 #define crs_stats PREFIXED_NAME(crs_stats)
24 #define crs_free PREFIXED_NAME(crs_free )
25 
26 /*
27  portable log base 2
28 
29  does a binary search to find leading order bit
30 
31  UINT_BITS = number of bits in a uint
32  BITS(0) = UINT_BITS
33  BITS(i) = half of BITS(i-1), rounded up
34  MASK(i) = bitmask with BITS(i) 1's followed by BITS(i) 0's
35 */
36 
37 static unsigned lg(uint v)
38 {
39  unsigned r = 0;
40 #define UINT_BITS (sizeof(uint)*CHAR_BIT)
41 #define BITS(i) ((UINT_BITS+(1<<(i))-1)>>(i))
42 #define MASK(i) ((((uint)1<<BITS(i)) - 1) << BITS(i))
43 #define CHECK(i) if((BITS(i)!=1) && (v&MASK(i))) v>>=BITS(i), r+=BITS(i)
44  CHECK(1); CHECK(2); CHECK(3); CHECK(4); CHECK(5); CHECK(6); CHECK(7);
45  CHECK(8); CHECK(9); /* this covers up to 1024-bit uints */
46  if(v&2) ++r;
47  return r;
48 #undef UINT_BITS
49 #undef BITS
50 #undef MASK
51 #undef CHECK
52 }
53 
54 struct csr_mat {
55  uint n, *Arp, *Aj; double *A;
56 };
57 
58 struct xxt {
59 
60  /* communication */
61 
62  struct comm comm;
63  uint pcoord; /* coordinate in communication tree */
64  unsigned plevels; /* # of stages of communication */
65  sint *pother; /* let p = pother[i], then during stage i of fan-in,
66  if p>=0, receive from p
67  if p< 0, send to (-p-1)
68  fan-out is just the reverse ...
69  on proc 0, pother is never negative
70  on others, pother is negative for the last stage only */
72 
73  /* separators */
74 
75  unsigned nsep; /* number of separators */
76  uint *sep_size; /* # of dofs on each separator,
77  ordered from the bottom to the top of the tree:
78  separator 0 is the bottom-most one (dofs not shared)
79  separator nsep-1 is the root of the tree */
80 
81  unsigned null_space;
82  double *share_weight;
83 
84  /* vector sizes */
85 
86  uint un; /* user's vector size */
87 
88  /* xxt_solve works with "condensed" vectors;
89  same dofs as in user's vectors, but no duplicates and no Dirichlet nodes,
90  and also ordered topologically (children before parents) according to the
91  separator tree */
92 
93  uint cn; /* size of condensed vectors */
94  sint *perm_u2c; /* permutation from user vector to condensed vector,
95  p=perm_u2c[i]; xu[i] = p=-1 ? 0 : xc[p]; */
96  uint ln, sn; /* xc[0 ... ln-1] are not shared (ln=sep_size[0])
97  xc[ln ... ln+sn-1] are shared
98  ln+sn = cn */
99 
100  uint xn; /* # of columns of x = sum_i(sep_size[i]) - sep_size[0] */
101 
102  /* data */
104  struct csr_mat A_sl;
105  uint *Xp; double *X; /* column i of X starts at X[Xp[i]] */
106 
107  /* execution buffers */
108  double *vl, *vc, *vx, *combuf;
109 };
110 
111 
112 /*
113  for the binary communication tree, the procs are divided in half
114  at each level, with the second half always the larger
115 
116  e.g., for np = 13:
117 
118  +------13-------+
119  | |
120  +---6---+ +---7---+
121  | | | |
122  +-3-+ +-3-+ +-3-+ +-4-+
123  1 2 1 2 1 2 2 2
124  1^1 1^1 1^1 1^1 1^1
125 
126  plevels is the number of levels in the tree
127  = np==1 ? 1 : ( lg(np-1)+2 )
128 
129  labelling the nodes with proc id's gives this communication tree:
130 
131  +-------0-------+
132  | |
133  +---0---+ +---6---+
134  | | | |
135  +-0-+ +-3-+ +-6-+ +-9-+
136  0 1 3 4 6 7 9 b
137  1^2 4^5 7^8 9^a b^c
138 
139  consider proc 7 (pid = 7);
140  pcoord gives the position of the leaf labelled 7:
141  Root Right Left Right Left -> RRLRL -> 11010
142  so pcoord = 11010 binary
143  note the parent coordinate can be found by bit shifting right
144  (i.e. dividing by 2)
145 */
146 
147 /* sets: pcoord, nsep, plevels, pother, req */
148 static void locate_proc(struct xxt *data)
149 {
150  const uint id = data->comm.id;
151  uint n = data->comm.np, c=1, odd=0, base=0;
152  unsigned level=0;
153  while(n>1) {
154  ++level;
155  odd=(odd<<1)|(n&1);
156  c<<=1, n>>=1;
157  if(id>=base+n) c|=1, base+=n, n+=(odd&1);
158  }
159  data->pcoord=c;
160  data->nsep = level+1;
161  data->plevels = data->nsep-1;
162  data->pother = tmalloc(sint,data->plevels);
163  data->req = tmalloc(comm_req,data->plevels);
164  for(level=0;level<data->plevels;++level) {
165  if((c&1)==1) {
166  uint targ = id - (n-(odd&1));
167  data->pother[level]=-(sint)(targ+1);
168  data->plevels = level+1;
169  break;
170  } else {
171  data->pother[level]=id+n;
172  c>>=1, n=(n<<1)+(odd&1), odd>>=1;
173  }
174  }
175 }
176 
177 /* the tuple list describing the condensed dofs:
178  [(separator level, share count, global id)] */
179 struct dof { ulong id; uint level, count; };
180 
181 /* determine the size of each separator;
182  sums the separator sizes following the fan-in, fan-out comm. pattern
183  uses the share-counts to avoid counting dofs more than once */
184 /* sets: xn, sep_size, ln, sn */
185 static void discover_sep_sizes(struct xxt *data,
186  struct array *dofa, buffer *buf)
187 {
188  const unsigned ns=data->nsep, nl=data->plevels;
189  const uint n = dofa->n;
190  float *v, *recv;
191  unsigned i,lvl; uint j;
192  const struct dof *dof = dofa->ptr;
193 
194  buffer_reserve(buf,2*ns*sizeof(float));
195  v=buf->ptr, recv=v+ns;
196 
197  for(i=0;i<ns;++i) v[i]=0;
198  for(j=0;j<n;++j) v[dof[j].level]+=1/(float)dof[j].count;
199 
200  /* fan-in */
201  for(lvl=0;lvl<nl;++lvl) {
202  sint other = data->pother[lvl];
203  unsigned s = ns-(lvl+1);
204  if(other<0) {
205  comm_send(&data->comm,v +lvl+1,s*sizeof(float),-other-1,s);
206  } else {
207  comm_recv(&data->comm,recv+lvl+1,s*sizeof(float),other,s);
208  for(i=lvl+1;i<ns;++i) v[i]+=recv[i];
209  }
210  }
211  /* fan-out */
212  for(;lvl;) {
213  sint other = data->pother[--lvl];
214  unsigned s = ns-(lvl+1);
215  if(other<0)
216  comm_recv(&data->comm,v+lvl+1,s*sizeof(float),-other-1,s);
217  else
218  comm_send(&data->comm,v+lvl+1,s*sizeof(float),other,s);
219  }
220 
221  data->xn=0;
222  data->sep_size = tmalloc(uint,ns);
223  for(i=0;i<ns;++i) {
224  uint s=v[i]+.1f;
225  data->sep_size[i]=s;
226  data->xn+=s;
227  }
228  data->ln=data->sep_size[0];
229  data->sn=data->cn-data->ln;
230  data->xn-=data->ln;
231 }
232 
233 /* assuming [A,Aend) is sorted,
234  removes 0's and any duplicate entries,
235  returns new end */
236 static ulong *unique_nonzero(ulong *A, ulong *Aend)
237 {
238  if(Aend==A) return A;
239  else {
240  ulong *end = Aend-1, last=*end, *p=A,*q=A,v=0;
241  *end = 1;
242  while(*q==0) ++q; /* *q==0 => q!=end since *end==0 */
243  *end = 0;
244  while(q!=end) {
245  v=*q++, *p++=v;
246  while(*q==v) ++q; /* *q==v => q!=end since *end==0 */
247  }
248  if(last!=v) *p++=last;
249  return p;
250  }
251 }
252 
253 static void merge_sep_ids(struct xxt *data, ulong *sep_id, ulong *other,
254  ulong *work, unsigned s0, buffer *buf)
255 {
256  const unsigned ns = data->nsep;
257  unsigned s;
258  ulong *p=sep_id, *q=other;
259  for(s=s0;s<ns;++s) {
260  ulong *end;
261  uint size = data->sep_size[s];
262  memcpy(work ,p,size*sizeof(ulong));
263  memcpy(work+size,q,size*sizeof(ulong));
264  sortv_long(work, work,2*size,sizeof(ulong), buf);
265  end = unique_nonzero(work,work+2*size);
266  memcpy(p,work,(end-work)*sizeof(ulong));
267  p+=size, q+=size;
268  }
269 }
270 
271 static void init_sep_ids(struct xxt *data, struct array *dofa, ulong *xid)
272 {
273  const unsigned ns=data->nsep;
274  const uint n=data->cn, *sep_size=data->sep_size;
275  unsigned s=1;
276  uint i, size;
277  const struct dof *dof = dofa->ptr;
278  if(ns==1) return;
279  size=sep_size[s];
280  for(i=data->ln;i<n;++i) {
281  unsigned si = dof[i].level;
282  while(s!=si) {
283  memset(xid,0,size*sizeof(ulong));
284  xid+=size;
285  if(++s != ns) size=data->sep_size[s];
286  }
287  *xid++ = dof[i].id, --size;
288  }
289  while(s!=ns) {
290  memset(xid,0,size*sizeof(ulong));
291  xid+=size;
292  if(++s != ns) size=data->sep_size[s];
293  }
294 }
295 
296 static void find_perm_x2c(uint ln, uint cn, const struct array *dofc,
297  uint xn, const ulong *xid, sint *perm)
298 {
299  const struct dof *dof = dofc->ptr, *dof_end = dof+cn;
300  const ulong *xid_end = xid+xn; uint i=ln;
301  dof+=ln;
302  while(dof!=dof_end) {
303  ulong v=dof->id;
304  while(*xid!=v) ++xid, *perm++ = -1;
305  *perm++ = i++, ++dof, ++xid;
306  }
307  while(xid!=xid_end) ++xid, *perm++ = -1;
308 }
309 
310 /* sets: perm_x2c */
311 static sint *discover_sep_ids(struct xxt *data, struct array *dofa, buffer *buf)
312 {
313  const unsigned ns=data->nsep, nl=data->plevels;
314  const uint xn=data->xn, *sep_size=data->sep_size;
315  ulong *xid, *recv, *work, *p;
316  unsigned lvl;
317  uint size,ss;
318  sint *perm_x2c;
319 
320  size=0; for(lvl=1;lvl<ns;++lvl) if(sep_size[lvl]>size) size=sep_size[lvl];
321  xid=tmalloc(ulong,2*xn+2*size), recv=xid+xn, work=recv+xn;
322 
323  init_sep_ids(data,dofa,xid);
324 
325  if(nl) {
326  /* fan-in */
327  p=xid, size=xn;
328  for(lvl=0;lvl<nl;++lvl) {
329  sint other = data->pother[lvl];
330  if(other<0) {
331  comm_send(&data->comm,p ,size*sizeof(ulong),-other-1,size);
332  } else {
333  comm_recv(&data->comm,recv,size*sizeof(ulong),other,size);
334  merge_sep_ids(data,p,recv,work,lvl+1,buf);
335  }
336  ss=data->sep_size[lvl+1];
337  if(ss>=size || lvl==nl-1) break;
338  p+=ss, size-=ss;
339  }
340  /* fan-out */
341  for(;;) {
342  sint other = data->pother[lvl];
343  if(other<0)
344  comm_recv(&data->comm,p,size*sizeof(ulong),-other-1,size);
345  else
346  comm_send(&data->comm,p,size*sizeof(ulong),other,size);
347  if(lvl==0) break;
348  ss=data->sep_size[lvl];
349  p-=ss, size+=ss, --lvl;
350  }
351  }
352 
353  perm_x2c=tmalloc(sint,xn);
354  find_perm_x2c(data->ln,data->cn,dofa, xn,xid, perm_x2c);
355  free(xid);
356 
357  return perm_x2c;
358 }
359 
360 static void apply_QQt(struct xxt *data, double *v, uint n, uint tag)
361 {
362  const unsigned nl=data->plevels;
363  double *p=v, *recv=data->combuf;
364  unsigned lvl, nsend=0;
365  uint size=n, ss;
366 
367  if(n==0 || nl==0) return;
368 
369  tag=tag*2+0;
370  /* fan-in */
371  for(lvl=0;lvl<nl;++lvl) {
372  sint other = data->pother[lvl];
373  if(other<0) {
374  comm_send(&data->comm,p ,size*sizeof(double),-other-1,tag);
375  } else {
376  uint i;
377  comm_recv(&data->comm,recv,size*sizeof(double),other ,tag);
378  for(i=0;i<size;++i) p[i]+=recv[i];
379  }
380  ss=data->sep_size[lvl+1];
381  if(ss>=size || lvl==nl-1) break;
382  p+=ss, size-=ss;
383  }
384  /* fan-out */
385  for(;;) {
386  sint other = data->pother[lvl];
387  if(other<0) {
388  comm_recv (&data->comm,p,size*sizeof(double),-other-1,tag);
389  } else {
390  comm_isend(&data->req[nsend++],&data->comm,
391  p,size*sizeof(double),other ,tag);
392  }
393  if(lvl==0) break;
394  ss=data->sep_size[lvl];
395  p-=ss, size+=ss, --lvl;
396  }
397  if(nsend) comm_wait(data->req,nsend);
398 }
399 
400 static double sum(struct xxt *data, double v, uint n, uint tag)
401 {
402  const unsigned nl=data->plevels;
403  double r;
404  unsigned lvl,nsend=0;
405  uint size=n, ss;
406 
407  tag=tag*2+1;
408  if(n==0 || nl==0) return v;
409  /* fan-in */
410  for(lvl=0;lvl<nl;++lvl) {
411  sint other = data->pother[lvl];
412  if(other<0) {
413  comm_send(&data->comm,&v,sizeof(double),-other-1,tag);
414  } else {
415  comm_recv(&data->comm,&r,sizeof(double),other ,tag);
416  v+=r;
417  }
418  ss=data->sep_size[lvl+1];
419  if(ss>=size || lvl==nl-1) break;
420  size-=ss;
421  }
422  /* fan-out */
423  for(;;) {
424  sint other = data->pother[lvl];
425  if(other<0) {
426  comm_recv (&data->comm,&v,sizeof(double),-other-1,tag);
427  } else {
428  comm_isend(&data->req[nsend++],&data->comm,
429  &v,sizeof(double),other ,tag);
430  }
431  if(lvl==0) break;
432  ss=data->sep_size[lvl];
433  size+=ss, --lvl;
434  }
435  if(nsend) comm_wait(data->req,nsend);
436  return v;
437 }
438 
439 /* sorts an array of ids, removes 0's and duplicates;
440  just returns the permutation */
441 static uint unique_ids(uint n, const ulong *id, sint *perm, buffer *buf)
442 {
443  uint *p, i, un=0; ulong last=0;
444  p = sortp_long(buf,0, id,n,sizeof(ulong));
445  for(i=0;i<n;++i) {
446  uint j = p[i]; ulong v = id[j];
447  if(v==0) perm[j]=-1;
448  else {
449  if(v!=last) last=v, ++un;
450  perm[j]=un-1;
451  }
452  }
453  buf->n=0;
454  return un;
455 }
456 
457 /* given user's list of dofs (as id's)
458  uses gather-scatter to find share-count and separator # for each
459  outputs as a list, sorted topologically (children before parents)
460  according to the sep. tree (and without duplicates),
461  as well as the permutation to get there from the user's list */
462 /* sets: un, cn, perm_u2c */
463 static void discover_dofs(
464  struct xxt *data, uint n, const ulong *id,
465  struct array *dofa, buffer *buf, const struct comm *comm)
466 {
467  const uint pcoord = data->pcoord, ns=data->nsep;
468  sint *perm;
469  uint i, cn, *p, *pi;
470  ulong *bid;
471  struct gs_data *gsh; sint *v;
472  struct dof *dof;
473 
474  data->un = n;
475  data->perm_u2c = perm = tmalloc(sint,n);
476  data->cn = cn = unique_ids(n,id,perm,buf);
477  array_init(struct dof,dofa,cn), dofa->n=cn, dof=dofa->ptr;
478  buffer_reserve(buf,cn*sizeof(ulong)), bid=buf->ptr;
479  for(i=0;i<n;++i) if(perm[i]>=0) bid[perm[i]]=dof[perm[i]].id=id[i];
480 
481  gsh = gs_setup((const slong*)bid,cn,comm,0,gs_crystal_router,0);
482  v = tmalloc(sint,cn);
483 
484  for(i=0;i<cn;++i) v[i]=pcoord;
485  gs(v,gs_sint,gs_bpr,0,gsh,buf);
486  for(i=0;i<cn;++i) dof[i].level=ns-1-lg((uint)v[i]);
487 
488  for(i=0;i<cn;++i) v[i]=1;
489  gs(v,gs_sint,gs_add,0,gsh,buf);
490  for(i=0;i<cn;++i) dof[i].count=v[i];
491 
492  free(v);
493  gs_free(gsh);
494 
495  if(!cn) return;
496  buffer_reserve(buf,2*cn*sizeof(uint));
497  p = sortp(buf,0, &dof[0].level,cn,sizeof(struct dof));
498  pi = p+cn; for(i=0;i<cn;++i) pi[p[i]]=i;
499  for(i=0;i<n;++i) if(perm[i]>=0) perm[i]=pi[perm[i]];
500  sarray_permute_buf(struct dof,dof,cn, buf);
501 }
502 
503 /* vl += A_ls * vs */
504 static void apply_p_Als(double *vl, struct xxt *data, const double *vs, uint ns)
505 {
506  const uint *Arp = data->A_sl.Arp,
507  *Aj = data->A_sl.Aj;
508  const double *A = data->A_sl.A;
509  uint i,p,pe;
510  for(i=0;i<ns;++i)
511  for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
512  vl[Aj[p]]+=A[p]*vs[i];
513 }
514 
515 /* vs -= A_sl * vl */
516 static void apply_m_Asl(double *vs, uint ns, struct xxt *data, const double *vl)
517 {
518  const uint *Arp = data->A_sl.Arp,
519  *Aj = data->A_sl.Aj;
520  const double *A = data->A_sl.A;
521  uint i,p,pe;
522  for(i=0;i<ns;++i)
523  for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
524  vs[i]-=A[p]*vl[Aj[p]];
525 }
526 
527 /* returns a column of S : vs = -S(0:ei-1,ei) */
528 static void apply_S_col(double *vs, struct xxt *data,
529  struct csr_mat *A_ss, uint ei, double *vl)
530 {
531  const uint ln=data->ln;
532  const uint *Asl_rp = data->A_sl.Arp, *Ass_rp = A_ss->Arp,
533  *Asl_j = data->A_sl.Aj, *Ass_j = A_ss->Aj;
534  const double *Asl = data->A_sl.A, *Ass = A_ss->A;
535  uint i,p,pe;
536  for(i=0;i<ei;++i) vs[i]=0;
537  for(p=Ass_rp[ei],pe=Ass_rp[ei+1];p!=pe;++p) {
538  uint j=Ass_j[p];
539  if(j>=ei) break;
540  vs[j]=-Ass[p];
541  }
542  for(i=0;i<ln;++i) vl[i]=0;
543  for(p=Asl_rp[ei],pe=Asl_rp[ei+1];p!=pe;++p) vl[Asl_j[p]]=-Asl[p];
544  sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
545  apply_m_Asl(vs,ei,data,vl);
546 }
547 
548 static void apply_S(double *Svs, uint ns, struct xxt *data,
549  struct csr_mat *A_ss, const double *vs, double *vl)
550 {
551  const uint ln=data->ln;
552  const uint *Ass_rp = A_ss->Arp,
553  *Ass_j = A_ss->Aj;
554  const double *Ass = A_ss->A;
555  uint i, p,pe;
556  for(i=0;i<ns;++i) {
557  double sum=0;
558  for(p=Ass_rp[i],pe=Ass_rp[i+1];p!=pe;++p) {
559  uint j=Ass_j[p];
560  if(j>=ns) break;
561  sum+=Ass[p]*vs[j];
562  }
563  Svs[i]=sum;
564  }
565  for(i=0;i<ln;++i) vl[i]=0;
566  apply_p_Als(vl,data,vs,ns);
567  sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
568  apply_m_Asl(Svs,ns,data,vl);
569 }
570 
571 /* vx = X' * vs */
572 static void apply_Xt(double *vx, uint nx, const struct xxt *data,
573  const double *vs)
574 {
575  const double *X = data->X; const uint *Xp = data->Xp;
576  uint i; for(i=0;i<nx;++i) vx[i]=tensor_dot(vs,X+Xp[i],Xp[i+1]-Xp[i]);
577 }
578 
579 /* vs = X * vx */
580 static void apply_X(double *vs, uint ns, const struct xxt *data,
581  const double *vx, uint nx)
582 {
583  const double *X = data->X; const uint *Xp = data->Xp;
584  uint i,j;
585  for(i=0;i<ns;++i) vs[i]=0;
586  for(i=0;i<nx;++i) {
587  const double v = vx[i];
588  const double *x = X+Xp[i]; uint n=Xp[i+1]-Xp[i];
589  for(j=0;j<n;++j) vs[j]+=x[j]*v;
590  }
591 }
592 
593 static void allocate_X(struct xxt *data, sint *perm_x2c)
594 {
595  uint xn=data->xn;
596  uint i,h=0;
597  if(data->null_space && xn) --xn;
598  data->Xp = tmalloc(uint,xn+1);
599  data->Xp[0]=0;
600  for(i=0;i<xn;++i) {
601  if(perm_x2c[i]!=-1) ++h;
602  data->Xp[i+1]=data->Xp[i]+h;
603  }
604  data->X = tmalloc(double,data->Xp[xn]);
605 }
606 
607 static void orthogonalize(struct xxt *data, struct csr_mat *A_ss,
608  sint *perm_x2c, buffer *buf)
609 {
610  uint ln=data->ln, sn=data->sn, xn=data->xn;
611  double *vl, *vs, *vx, *Svs;
612  uint i,j;
613 
614  allocate_X(data,perm_x2c);
615 
616  buffer_reserve(buf,(ln+2*sn+xn)*sizeof(double));
617  vl=buf->ptr, vs=vl+ln, Svs=vs+sn, vx=Svs+sn;
618 
619  if(data->null_space && xn) --xn;
620  for(i=0;i<xn;++i) {
621  uint ns=data->Xp[i+1]-data->Xp[i];
622  sint ui = perm_x2c[i];
623  double ytsy, *x;
624 
625  if(ui == -1) {
626  for(j=0;j<i;++j) vx[j]=0;
627  } else {
628  ui-=ln;
629  apply_S_col(vs, data,A_ss, ui, vl);
630  apply_Xt(vx,i, data, vs);
631  }
632  apply_QQt(data,vx,i,xn-i);
633  apply_X(vs,ns, data, vx,i);
634  if(ui!=-1) vs[ui]=1;
635  apply_S(Svs,ns, data,A_ss, vs, vl);
636  ytsy = tensor_dot(vs,Svs,ns);
637  ytsy = sum(data,ytsy,i+1,xn-(i+1));
638  if(ytsy<DBL_EPSILON/128) ytsy=0; else ytsy = 1/sqrt(ytsy);
639  x=&data->X[data->Xp[i]];
640  for(j=0;j<ns;++j) x[j]=ytsy*vs[j];
641  }
642 }
643 
644 struct yale_mat { uint i,j; double v; };
645 
646 /* produces CSR matrix from Yale-like format, summing duplicates */
647 static void condense_matrix(struct array *mat, uint nr,
648  struct csr_mat *out, buffer *buf)
649 {
650  uint k, nz=mat->n;
651  struct yale_mat *p, *q;
652  sarray_sort_2(struct yale_mat,mat->ptr,mat->n, i,0, j,0, buf);
653 
654  p = mat->ptr;
655  for(k=0;k+1<nz;++k,++p) if(p[0].i==p[1].i && p[0].j==p[1].j) break;
656  if(++k<nz) {
657  uint i=p->i,j=p->j;
658  q = p+1;
659  for(;k<nz;++k,++q) {
660  if(i==q->i&&j==q->j) p->v += q->v, --mat->n;
661  else ++p, p->i=i=q->i,p->j=j=q->j, p->v=q->v;
662  }
663  }
664 
665  nz=mat->n;
666  out->n=nr;
667  out->Arp = tmalloc(uint,nr+1+mat->n);
668  out->Aj = out->Arp+nr+1;
669  out->A = tmalloc(double,mat->n);
670  for(k=0;k<nr;++k) out->Arp[k]=0;
671  for(p=mat->ptr,k=0;k<nz;++k,++p)
672  out->Arp[p->i]++, out->Aj[k]=p->j, out->A[k]=p->v;
673  nz=0; for(k=0;k<=nr;++k) { uint t=out->Arp[k]; out->Arp[k]=nz, nz+=t; }
674 }
675 
676 static void separate_matrix(
677  uint nz, const uint *Ai, const uint *Aj, const double *A,
678  const sint *perm, uint ln, uint sn,
679  struct csr_mat *out_ll, struct csr_mat *out_sl, struct csr_mat *out_ss,
680  buffer *buf
681 )
682 {
683  uint k,n;
684  struct array mat_ll, mat_sl, mat_ss;
685  struct yale_mat *mll, *msl, *mss;
686  array_init(struct yale_mat,&mat_ll,2*nz), mll=mat_ll.ptr;
687  array_init(struct yale_mat,&mat_sl,2*nz), msl=mat_sl.ptr;
688  array_init(struct yale_mat,&mat_ss,2*nz), mss=mat_ss.ptr;
689  for(k=0;k<nz;++k) {
690  sint i=perm[Ai[k]], j=perm[Aj[k]];
691  if(i<0 || j<0 || A[k]==0) continue;
692  if((uint)i<ln) {
693  if((uint)j<ln)
694  n=mat_ll.n++,mll[n].i=i,mll[n].j=j,mll[n].v=A[k];
695  } else {
696  if((uint)j<ln)
697  n=mat_sl.n++,msl[n].i=i-ln,msl[n].j=j,msl[n].v=A[k];
698  else
699  n=mat_ss.n++,mss[n].i=i-ln,mss[n].j=j-ln,mss[n].v=A[k];
700  }
701  }
702  condense_matrix(&mat_ll,ln,out_ll,buf);
703  condense_matrix(&mat_sl,sn,out_sl,buf);
704  condense_matrix(&mat_ss,sn,out_ss,buf);
705  array_free(&mat_ll);
706  array_free(&mat_sl);
707  array_free(&mat_ss);
708 }
709 
710 struct xxt *crs_setup(
711  uint n, const ulong *id,
712  uint nz, const uint *Ai, const uint *Aj, const double *A,
713  uint null_space, const struct comm *comm)
714 {
715  struct xxt *data = tmalloc(struct xxt,1);
716  sint *perm_x2c;
717  struct array dofa;
718  struct csr_mat A_ll, A_ss;
719  buffer buf;
720 
721  comm_dup(&data->comm,comm);
722 
723  locate_proc(data);
724 
725  data->null_space=null_space;
726 
727  buffer_init(&buf,1024);
728 
729  discover_dofs(data,n,id,&dofa,&buf,&data->comm);
730  discover_sep_sizes(data,&dofa,&buf);
731 
732  perm_x2c = discover_sep_ids(data,&dofa,&buf);
733  if(data->null_space) {
734  uint i; double count = 0; struct dof *dof = dofa.ptr;
735  for(i=0;i<data->cn;++i) count+=1/(double)dof[i].count;
736  count=1/sum(data,count,data->xn,0);
737  data->share_weight=tmalloc(double,data->cn);
738  for(i=0;i<data->cn;++i)
739  data->share_weight[i]=count/dof[i].count;
740  }
741  array_free(&dofa);
742 
743  if(!data->null_space || data->xn!=0) {
744  separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
745  data->ln,data->sn,
746  &A_ll,&data->A_sl,&A_ss,
747  &buf);
748  } else {
749  separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
750  data->ln-1,1,
751  &A_ll,&data->A_sl,&A_ss,
752  &buf);
753  }
754 
755  sparse_cholesky_factor(A_ll.n,A_ll.Arp,A_ll.Aj,A_ll.A,
756  &data->fac_A_ll, &buf);
757  free(A_ll.Arp); free(A_ll.A);
758 
759  data->vl = tmalloc(double,data->ln+data->cn+2*data->xn);
760  data->vc = data->vl+data->ln;
761  data->vx = data->vc+data->cn;
762  data->combuf = data->vx+data->xn;
763 
764  orthogonalize(data,&A_ss,perm_x2c,&buf);
765  free(A_ss.Arp); free(A_ss.A);
766  free(perm_x2c);
767  buffer_free(&buf);
768 
769  return data;
770 }
771 
772 void crs_solve(double *x, struct xxt *data, const double *b)
773 {
774  uint cn=data->cn, un=data->un, ln=data->ln, sn=data->sn, xn=data->xn;
775  double *vl=data->vl, *vc=data->vc, *vx=data->vx;
776  uint i;
777  for(i=0;i<cn;++i) vc[i]=0;
778  for(i=0;i<un;++i) {
779  sint p=data->perm_u2c[i];
780  if(p>=0) vc[p]+=b[i];
781  }
782  if(xn>0 && (!data->null_space || xn>1)) {
783  if(data->null_space) --xn;
784  sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
785  apply_m_Asl(vc+ln,sn, data, vc);
786  apply_Xt(vx,xn, data, vc+ln);
787  apply_QQt(data,vx,xn,0);
788  apply_X(vc+ln,sn, data, vx,xn);
789  for(i=0;i<ln;++i) vl[i]=0;
790  apply_p_Als(vl, data, vc+ln,sn);
791  sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
792  for(i=0;i<ln;++i) vc[i]-=vl[i];
793  } else {
794  sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
795  if(data->null_space) {
796  if(xn==0) vc[ln-1]=0;
797  else if(sn==1) vc[ln]=0;
798  }
799  }
800  if(data->null_space) {
801  double s=0;
802  for(i=0;i<cn;++i) s+=data->share_weight[i]*vc[i];
803  s = sum(data,s,data->xn,0);
804  for(i=0;i<cn;++i) vc[i]-=s;
805  }
806  for(i=0;i<un;++i) {
807  sint p=data->perm_u2c[i];
808  x[i] = p>=0 ? vc[p] : 0;
809  }
810 }
811 
812 void crs_stats(struct xxt *data)
813 {
814  int a,b; uint xcol;
815  if(data->comm.id==0) {
816  unsigned s;
817  printf("xxt: separator sizes on %d =",(int)data->comm.id);
818  for(s=0;s<data->nsep;++s) printf(" %d",(int)data->sep_size[s]);
819  printf("\n");
820  printf("xxt: shared dofs on %d = %d\n",(int)data->comm.id,(int)data->sn);
821  }
822  a=data->ln;
823  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
824  if(data->comm.id==0) printf("xxt: max non-shared dofs = %d\n",a);
825  a=data->sn;
826  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
827  if(data->comm.id==0) printf("xxt: max shared dofs = %d\n",a);
828  xcol=data->xn; if(xcol&&data->null_space) --xcol;
829  a=xcol;
830  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
831  if(data->comm.id==0) printf("xxt: max X cols = %d\n",a);
832  a=data->Xp[xcol]*sizeof(double);
833  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
834  if(data->comm.id==0) printf("xxt: max X size = %d bytes\n",a);
835 }
836 
837 void crs_free(struct xxt *data)
838 {
839  comm_free(&data->comm);
840  free(data->pother);
841  free(data->req);
842  free(data->sep_size);
843  free(data->perm_u2c);
844  if(data->null_space) free(data->share_weight);
846  free(data->A_sl.Arp); free(data->A_sl.A);
847  free(data->Xp); free(data->X);
848  free(data->vl);
849  free(data);
850 }
851 
#define slong
Definition: types.h:74
struct sparse_cholesky fac_A_ll
Definition: xxt.c:103
#define uint
Definition: types.h:70
uint j
Definition: xxt.c:644
static double sum(struct xxt *data, double v, uint n, uint tag)
Definition: xxt.c:400
size_t n
Definition: mem.h:111
#define buffer_free(b)
Definition: mem.h:158
#define tmalloc(type, count)
Definition: mem.h:91
#define sint
Definition: types.h:69
uint un
Definition: xxt.c:86
X
Definition: xxt_test.m:69
static void comm_recv(const struct comm *c, void *p, size_t n, uint src, int tag)
Definition: comm.h:199
double * vl
Definition: xxt.c:108
static void locate_proc(struct xxt *data)
Definition: xxt.c:148
#define sarray_sort_2(T, A, n, field1, is_long1, field2, is_long2, buf)
Definition: sarray_sort.h:60
uint n
Definition: xxt.c:55
static unsigned lg(uint v)
Definition: xxt.c:37
n
Definition: xxt_test.m:73
static void comm_wait(comm_req *req, int n)
Definition: comm.h:236
uint * Aj
Definition: xxt.c:55
static void init_sep_ids(struct xxt *data, struct array *dofa, ulong *xid)
Definition: xxt.c:271
double v
Definition: xxt.c:644
uint * Arp
Definition: xxt.c:55
uint cn
Definition: xxt.c:93
static void discover_dofs(struct xxt *data, uint n, const ulong *id, struct array *dofa, buffer *buf, const struct comm *comm)
Definition: xxt.c:463
static void separate_matrix(uint nz, const uint *Ai, const uint *Aj, const double *A, const sint *perm, uint ln, uint sn, struct csr_mat *out_ll, struct csr_mat *out_sl, struct csr_mat *out_ss, buffer *buf)
Definition: xxt.c:676
void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, void *v, uint vn, void *buf)
Definition: comm.c:107
#define gs
Definition: gs.c:26
#define x
static void apply_S_col(double *vs, struct xxt *data, struct csr_mat *A_ss, uint ei, double *vl)
Definition: xxt.c:528
static void apply_Xt(double *vx, uint nx, const struct xxt *data, const double *vs)
Definition: xxt.c:572
#define gs_sint
Definition: gs_defs.h:65
static void comm_send(const struct comm *c, void *p, size_t n, uint dst, int tag)
Definition: comm.h:212
ulong A[NUM][SI]
Definition: sort_test.c:17
static void apply_m_Asl(double *vs, uint ns, struct xxt *data, const double *vl)
Definition: xxt.c:516
Definition: comm.h:85
uint count
Definition: xxt.c:179
#define gs_setup
Definition: gs.c:29
ns
Definition: xxt_test.m:43
uint * Xp
Definition: xxt.c:105
static void comm_free(struct comm *c)
Definition: comm.h:176
comm_req * req
Definition: xxt.c:71
#define gs_free
Definition: gs.c:30
unsigned null_space
Definition: xxt.c:81
int comm_req
Definition: comm.h:70
static void apply_QQt(struct xxt *data, double *v, uint n, uint tag)
Definition: xxt.c:360
struct comm comm
Definition: xxt.c:62
#define crs_solve
Definition: xxt.c:22
double * X
Definition: xxt.c:105
#define tensor_dot
Definition: tensor.c:7
static void orthogonalize(struct xxt *data, struct csr_mat *A_ss, sint *perm_x2c, buffer *buf)
Definition: xxt.c:607
#define crs_free
Definition: xxt.c:24
#define sarray_permute_buf(T, A, n, buf)
Definition: sarray_sort.h:45
double * vc
Definition: xxt.c:108
#define buffer_reserve(b, max)
Definition: mem.h:157
static void apply_S(double *Svs, uint ns, struct xxt *data, struct csr_mat *A_ss, const double *vs, double *vl)
Definition: xxt.c:548
const uint Ai[3][32]
Definition: xxt_test.c:80
p
Definition: xxt_test2.m:1
static void apply_p_Als(double *vl, struct xxt *data, const double *vs, uint ns)
Definition: xxt.c:504
uint np
Definition: comm.h:86
uint ln
Definition: xxt.c:96
Definition: mem.h:111
unsigned plevels
Definition: xxt.c:64
#define crs_stats
Definition: xxt.c:23
#define ulong
Definition: types.h:75
#define sparse_cholesky_solve
uint pcoord
Definition: xxt.c:63
#define crs_setup
Definition: xxt.c:21
Definition: xxt.c:54
static void allocate_X(struct xxt *data, sint *perm_x2c)
Definition: xxt.c:593
#define array_init(T, a, max)
Definition: mem.h:136
static void apply_X(double *vs, uint ns, const struct xxt *data, const double *vx, uint nx)
Definition: xxt.c:580
sint * pother
Definition: xxt.c:65
Ass
Definition: xxt_test.m:41
uint i
Definition: xxt.c:644
uint * sep_size
Definition: xxt.c:76
static void find_perm_x2c(uint ln, uint cn, const struct array *dofc, uint xn, const ulong *xid, sint *perm)
Definition: xxt.c:296
#define CHECK(i)
static ulong * unique_nonzero(ulong *A, ulong *Aend)
Definition: xxt.c:236
#define sortp_long
Definition: sort.h:57
uint * sortp(buffer *restrict buf, int start_perm, const T *restrict A, uint n, unsigned stride)
Definition: sort_imp.h:477
static void condense_matrix(struct array *mat, uint nr, struct csr_mat *out, buffer *buf)
Definition: xxt.c:647
for i
Definition: xxt_test.m:74
struct csr_mat A_sl
Definition: xxt.c:104
ulong id
Definition: xxt.c:179
unsigned nsep
Definition: xxt.c:75
void * ptr
Definition: mem.h:111
const uint Aj[3][32]
Definition: xxt_test.c:85
#define sparse_cholesky_free
#define buffer_init(b, max)
Definition: mem.h:155
Definition: gs.c:1039
uint level
Definition: xxt.c:179
static const unsigned nr[3]
#define array_free(a)
Definition: mem.h:135
Gather/Scatter Library interface.
uint id
Definition: comm.h:86
#define comm_dup(d, s)
Definition: comm.h:174
static void merge_sep_ids(struct xxt *data, ulong *sep_id, ulong *other, ulong *work, unsigned s0, buffer *buf)
Definition: xxt.c:253
ulong out[N]
Definition: sort_test2.c:20
static double work[TNR *NS]
#define sparse_cholesky_factor
establishes some macros to establish naming conventions
sint * perm_u2c
Definition: xxt.c:94
double * vx
Definition: xxt.c:108
Definition: xxt.c:58
double * share_weight
Definition: xxt.c:82
static void discover_sep_sizes(struct xxt *data, struct array *dofa, buffer *buf)
Definition: xxt.c:185
uint sn
Definition: xxt.c:96
static uint unique_ids(uint n, const ulong *id, sint *perm, buffer *buf)
Definition: xxt.c:441
const uint nz[3]
Definition: xxt_test.c:78
double * A
Definition: xxt.c:55
Definition: xxt.c:179
static void comm_isend(comm_req *req, const struct comm *c, void *p, size_t n, uint dst, int tag)
Definition: comm.h:228
Definition: xxt.c:644
static sint * discover_sep_ids(struct xxt *data, struct array *dofa, buffer *buf)
Definition: xxt.c:311
const uint nx[3]
Definition: xxt_test.c:41
#define sortv_long
Definition: sort.h:56
uint xn
Definition: xxt.c:100
double * combuf
Definition: xxt.c:108