16 #define findpts_el_setup_2 PREFIXED_NAME(findpts_el_setup_2)
17 #define findpts_el_free_2 PREFIXED_NAME(findpts_el_free_2 )
18 #define findpts_el_2 PREFIXED_NAME(findpts_el_2 )
19 #define findpts_el_eval_2 PREFIXED_NAME(findpts_el_eval_2 )
24 #define DIAGNOSTICS_ITERATIONS 0
26 #if defined(DIAGNOSTICS_1) || defined(DIAGNOSTICS_2) \
27 || DIAGNOSTICS_ITERATIONS > 0
34 const double idet = 1/(A[0]*A[3] - A[1]*A[2]);
35 x[0] = idet*(A[3]*y[0] - A[1]*y[1]);
36 x[1] = idet*(A[0]*y[1] - A[2]*y[0]);
52 #define CONVERGED_FLAG (1u<<4)
53 #define FLAG_MASK 0x1fu
57 const unsigned y = flags | flags>>1;
58 return (y&1u) + (y>>2 & 1u);
63 return (flags>>2 & 3u)*3 + (flags & 3u);
70 const unsigned mask = 0u - (flags>>4);
80 const unsigned y = x&7u;
81 return (y-(y>>2)) | ((x-1)&4u);
88 return ((x>>1)&1u) | ((x>>2)&2u);
129 const unsigned nr,
const unsigned ns,
const unsigned npt_max)
131 const unsigned n = ns>nr?ns:
nr;
133 #define DO_MAX(x) do { const unsigned temp=(x); \
134 wsize=temp>wsize?temp:wsize; } while(0)
135 wsize = (6 + 2*(2*nr+
ns)) * npt_max;
145 const unsigned npt_max)
147 const unsigned nr=n[0],
ns=n[1];
148 const unsigned tot = 8*ns + 4*
nr;
149 unsigned d,
i, lag_size[2];
157 fd->
z[0] =
tmalloc(
double,lag_size[0]+lag_size[1]
160 fd->
z[1] = fd->
z[0]+
nr;
171 double *wt=fd->
wtend[d];
unsigned n=fd->
n[d];
177 wt[0]=1;
for(i=1;i<
n;++
i) wt[i]=0;
178 wt+=3*
n; {
for(i=0;i<n-1;++
i) wt[i]=0; } wt[
i]=1;
184 fd->edge[1].x[d] = fd->
sides + (4+d)*ns, \
188 fd->
edge[2].
x[d] = 0,
190 fd->
edge[3].
x[d] = 0,
205 const unsigned nr = fd->
n[0],
ns=fd->
n[1], nrsm1 = nr*(
ns-1);
208 memcpy(work , fd->
wtend[1]+
ns,
ns*
sizeof(
double));
209 memcpy(work+
ns, fd->
wtend[1]+4*
ns,
ns*
sizeof(
double));
212 memcpy(
out+ d *nr, work+2*
ns , nr*
sizeof(
double));
213 memcpy(
out+(2+d)*nr, work+2*
ns+nr , nr*
sizeof(
double));
214 fd->
edge[2].
x[d] = fd->
x[d];
215 fd->
edge[3].
x[d] = fd->
x[d] + nrsm1;
222 const unsigned nr = fd->
n[0],
ns=fd->
n[1];
225 memcpy(work , fd->
wtend[0] , 2*nr*
sizeof(
double));
226 memcpy(work+2*nr, fd->
wtend[0]+3*nr, 2*nr*
sizeof(
double));
229 memcpy(
out+ d *
ns, work+4*nr , ns*
sizeof(
double));
230 memcpy(
out+(2+d)*ns, work+4*nr+ ns, ns*
sizeof(
double));
231 memcpy(
out+(4+d)*ns, work+4*nr+2*ns, ns*
sizeof(
double));
232 memcpy(
out+(6+d)*ns, work+4*nr+3*ns, ns*
sizeof(
double));
239 const unsigned mask = 1u<<(ei/2);
248 return &fd->
edge[ei];
254 const unsigned nr = fd->
n[0],
ns = fd->
n[1];
260 for(j=0;j<2;++j)
for(i=0;i<2;++
i) {
261 fd->
pt[2*j+
i].
x[d] = work2[6*(3*j+0)+(3*i+0)];
262 fd->
pt[2*j+
i].
jac[2*d+0] = work2[6*(3*j+0)+(3*i+1)];
263 fd->
pt[2*j+
i].
jac[2*d+1] = work2[6*(3*j+1)+(3*i+0)];
264 fd->
pt[2*j+
i].
hes[2*d+0] = work2[6*(3*j+0)+(3*i+2)];
265 fd->
pt[2*j+
i].
hes[2*d+1] = work2[6*(3*j+2)+(3*i+0)];
284 const double resid[2],
288 const double old_dist2 = p->
dist2;
289 const double dist2 = resid[0]*resid[0]+resid[1]*resid[1];
290 const double decr = old_dist2-dist2;
291 const double pred = p->
dist2p;
292 out->
x[0]=p->
x[0],out->
x[1]=p->
x[1];
297 printf(
"Checking prior step:\n"
298 " old r = (%.17g,%.17g), old flags = %x\n"
299 " old_dist2 = %.17g\n"
300 " r = (%.17g,%.17g), flags = %x\n"
302 " difference = %.17g\n"
303 " predicted = %.17g\n"
307 decr, pred, decr/pred);
309 if(decr>= 0.01 * pred) {
310 if(decr>= 0.9 * pred) {
313 printf(
" very good iteration; tr -> %g\n", out->
tr);
317 printf(
" good iteration; tr = %g\n", p->
tr);
327 double v0 = fabs(p->
r[0]-p->
oldr[0]),
328 v1 = fabs(p->
r[1]-p->
oldr[1]);
329 out->
tr = (v0>v1?v0:v1)/4;
331 printf(
" bad iteration; tr -> %g\n", out->
tr);
333 out->
dist2=old_dist2;
345 const double jac[4],
const double resid[2],
348 const double tr = p->
tr;
349 double bnd[4] = { -1,1, -1,1 };
352 unsigned d, mask, flags;
354 r0[0] = p->
r[0], r0[1] = p->
r[1];
357 printf(
"newton_area:\n");
358 printf(
" resid = (%g,%g); r^T r / 2 = %g\n",resid[0],resid[1],
359 (resid[0]*resid[0]+resid[1]*resid[1])/2);
360 printf(
" jac = %g\t%g\n"
362 jac[0],jac[1],jac[2],jac[3]);
363 printf(
" r = (%.17g,%.17g)\n",r0[0],r0[1]);
368 if(r0[d]-tr>-1) bnd[2*d ]=r0[d]-tr, mask^=1u<<(2*d);
369 if(r0[d]+tr< 1) bnd[2*d+1]=r0[d]+tr, mask^=2u<<(2*d);
375 printf(
" min at r = (%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
380 double nr = r0[d]+dr[d];
381 if((nr-bnd[2*d])*(bnd[2*d+1]-nr)>=0)
continue;
383 double f = (bnd[2*d ]-r0[d])/dr[d];
384 if(f<fac) fac=f, flags = 1u<<(2*d);
386 double f = (bnd[2*d+1]-r0[d])/dr[d];
387 if(f<fac) fac=f, flags = 2u<<(2*d);
392 printf(
" flags = %x, fac = %.15g\n",flags,fac);
395 if(flags==0)
goto newton_area_fin;
397 for(d=0;d<2;++d) dr[d]*=fac;
402 const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
403 res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]);
405 const double y = jac[de]*res0+jac[2+de]*res1;
407 const double JtJ = jac[ de]*jac[ de]
408 +jac[2+de]*jac[2+de];
409 const double drc = y/JtJ;
411 unsigned new_flags = 0;
413 printf(
" edge %u, de=%u\n",ei,de);
414 printf(
" r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
415 printf(
" resid = (%g,%g); r^T r / 2 = %g\n",res0,res1,
416 (res[0]*res[0]+res[1]*res[1])/2);
417 printf(
" min at %.17g\n", r0[de]+dr[de]+drc);
420 const double rz = r0[de]+dr[de], lb=bnd[2*de],ub=bnd[2*de+1];
421 const double nr = r0[de]+(dr[de]+drc);
422 if((nr-lb)*(ub-
nr)<0) {
424 double f = (lb-rz)/drc;
425 if(f<fac) fac=f, new_flags = 1u<<(2*de);
427 double f = (ub-rz)/drc;
428 if(f<fac) fac=f, new_flags = 2u<<(2*de);
433 printf(
" new_flags = %x, fac = %.17g\n",new_flags,fac);
437 goto newton_area_relax;
442 const unsigned old_flags = flags;
444 const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
445 res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]);
447 double y[2]; y[0] = jac[0]*res0+jac[2]*res1,
448 y[1] = jac[1]*res0+jac[3]*res1;
449 #define SETDR(d) do { \
450 unsigned f = flags>>(2*d) & 3u; \
451 if(f) dr[d] = bnd[2*d+(f-1)] - r0[d]; \
456 unsigned c = flags>>(2*d) & 3u;
458 else if(dr[d]*y[d]<0) flags &= ~(3u<<(2*d));
460 if( (c==1&&dr[d]>0) || (c==2&&dr[d]<0) )
461 printf(
"FAIL! c=%u, dr[d]=%g\n",c,dr[d]);
465 printf(
" checking constraints (%x)\n",old_flags);
466 printf(
" r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
467 printf(
" resid = (%g,%g); r^T r / 2 = %g\n",res[0],res[1],
468 (res[0]*res[0]+res[1]*res[1])/2);
469 printf(
" relaxed %x -> %x\n",old_flags,flags);
471 if(flags==old_flags)
goto newton_area_fin;
473 case 1:
goto newton_area_edge;
480 const double res[2]={ resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
481 resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]) };
482 printf(
" r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
483 printf(
" resid = (%g,%g); r^T r / 2 = %g\n",res[0],res[1],
484 (res[0]*res[0]+res[1]*res[1])/2);
490 const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
491 res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]);
492 out->
dist2p=resid[0]*resid[0]+resid[1]*resid[1]
493 -(res0*res0+res1*res1);
495 #define SETR(d) do { \
496 unsigned f = flags>>(2*d) & 3u; \
497 out->r[d] = f==0 ? r0[d]+dr[d] : ( f==1 ? -1 : 1 ); \
505 const double jac[4],
const double rhes,
const double resid[2],
506 const unsigned de,
const unsigned dn,
510 const double tr = p->
tr;
512 const double A = jac[ de]*jac[ de]
513 +jac[2+de]*jac[2+de] - rhes;
515 const double y = jac[ de]*resid[0]
518 const double oldr = p->
r[de];
519 double dr,
nr,tdr,tnr;
520 double v,tv;
unsigned new_flags=0, tnew_flags=0;
523 printf(
"Newton edge %u (dn=%u) flags=%x\n",de,dn,flags);
524 printf(
" A=%g, y=%g\n",A,y);
525 if(A<=0) printf(
" A not positive\n");
526 printf(
" r=(%.17g,%.17g)\n",p->
r[0],p->
r[1]);
529 #define EVAL(dr) (dr*A-2*y)*dr
533 dr = y/
A, nr = oldr+dr;
534 if(fabs(dr)<tr && fabs(nr)<1) { v=
EVAL(dr);
goto newton_edge_fin; }
537 if(( nr=oldr-tr)>-1) dr=-tr;
538 else nr=-1, dr=-1-oldr, new_flags = flags | 1u<<(2*de);
541 if((tnr=oldr+tr)< 1) tdr=tr;
542 else tnr= 1, tdr= 1-oldr, tnew_flags = flags | 2u<<(2*de);
545 if(tv<v) nr=tnr, dr=tdr, v=tv, new_flags=tnew_flags;
553 out->
flags = flags | new_flags | (p->
flags<<5);
555 printf(
" new r = (%.17g,%.17g)\n",out->
r[0],out->
r[1]);
562 const struct findpts_el_pt_2 *
const p,
const unsigned pn,
const double tol);
570 const unsigned nr=fd->
n[0],
ns=fd->
n[1];
571 double *
const resid = fd->
work, *
const jac = resid + 2*pn,
572 *
const wtr =
jac+4*pn, *
const wts = wtr+2*nr*pn,
573 *
const slice = wts+2*
ns*pn;
574 unsigned i;
unsigned d;
577 fd->
lag[0](wtr+2*i*nr, fd->
lag_data[0], nr, 1, p[i].
r[0]);
583 const double *
const wtr_i = wtr+2*i*
nr, *
const slice_i = slice+2*i*
nr;
584 double *
const jac_i =
jac+4*i+2*d;
587 jac_i[1] =
tensor_i1(wtr_i,nr, slice_i+nr);
606 const unsigned n = fd->
n[de];
607 double *
const resid=fd->
work, *
const jac=resid+2*pn, *
const hes=
jac+4*pn,
608 *
const wt = hes+pn, *
const slice = wt+3*n*pn;
610 unsigned i;
unsigned d;
613 printf(
"Edge %u\n",ei);
614 printf(
" pflag = %u\n",pflag);
615 printf(
" ei = %u\n",ei);
616 printf(
" dn, de = %u, %u\n",dn,de);
617 printf(
" n = %u \n", n);
622 fd->
lag[de](wt+3*i*n, fd->
lag_data[de], n, 2, p[i].
r[de]);
623 for(i=0;i<pn;++
i) hes[i]=0;
627 const double *
const slice_i = slice+3*
i;
629 resid[2*i+d] = r = p[
i].
x[d] - slice_i[0];
630 jac[4*i+2*d+de] = slice_i[1];
631 hes[
i] += r * slice_i[2];
634 for(i=1;i<pn;++
i) memcpy(wt+i*n, wt+3*i*n, n*
sizeof(
double));
637 for(i=0;i<pn;++
i) jac[4*i+2*d+dn] = slice[i];
641 double *
const resid_i=resid+2*
i, *
const jac_i=jac+4*
i, *
const hes_i=hes+
i;
645 const double steep = resid_i[0] * jac_i[ dn]
646 +resid_i[1] * jac_i[2+dn];
648 printf(
"jacobian = %g\t%g\n"
649 " %g\t%g\n",jac_i[0],jac_i[1],jac_i[2],jac_i[3]);
650 printf(
"resid_i = (%g,%g)\n", resid_i[0],resid_i[1]);
651 printf(
"steep = %g (%s)\n", steep, steep * p[i].r[dn] < 0 ?
"in" :
"out");
653 if(steep * p[i].r[dn] < 0)
656 newton_edge(out+i, jac_i, *hes_i, resid_i, de,dn,pflag, p+i, tol);
669 const double *
const x = gpt->
x, *
const jac = gpt->
jac, *
const hes = gpt->
hes;
673 printf(
"Point %u\n",pi);
674 printf(
" pflag = %u\n",pflag);
675 printf(
" pi = %u\n",pi);
679 double resid[2], steep[2], sr[2];
681 resid[0] = p[
i].
x[0]-x[0],
682 resid[1] = p[
i].
x[1]-x[1];
683 steep[0] =
jac[0]*resid[0] +
jac[2]*resid[1],
684 steep[1] =
jac[1]*resid[0] +
jac[3]*resid[1];
685 sr[0] = steep[0]*p[
i].
r[0],
686 sr[1] = steep[1]*p[
i].
r[1];
691 if(sr[1]<0)
goto findpt_pt_area;
692 else { de=0,dn=1;
goto findpt_pt_edge; }
694 else if(sr[1]<0) { de=1,dn=0;
goto findpt_pt_edge; }
695 out[
i].
r[0]=p[
i].
r[0],out[
i].
r[1]=p[
i].
r[1];
703 const double rh = resid[0]*
hes[de]+resid[1]*
hes[2+de];
705 pflag&(3u<<(2*dn)), p+i, tol);
714 const unsigned nr=fd->
n[0],
ns=fd->
n[1];
716 for(p=pt;p!=pe;++
p) p->
dist2=DBL_MAX;
718 const double zs=fd->
z[1][j];
720 const double zr=fd->
z[0][
i];
721 const double x=fd->
x[0][ii],
y=fd->
x[1][ii];
723 for(p=pt;p!=pe;++
p) {
724 const double dx=p->
x[0]-
x,dy=p->
x[1]-
y;
725 const double dist2 = dx*dx+dy*dy;
726 if(p->
dist2<=dist2)
continue;
740 unsigned nconv = npt;
742 unsigned count[9] = { 0,0,0, 0,0,0, 0,0,0 } ;
747 pstart[
i].x[0]=pbuf[
i].
x[0];
748 pstart[
i].x[1]=pbuf[
i].
x[1];
749 pstart[
i].r[0]=pbuf[
i].
r[0];
750 pstart[
i].r[1]=pbuf[
i].
r[1];
751 pstart[
i].index=
i,pstart[
i].flags=0;
752 pstart[
i].dist2=DBL_MAX,pstart[
i].dist2p=0,pstart[
i].tr=1;
755 while(nconv && step++ < 50) {
759 #if DIAGNOSTICS_ITERATIONS>1
761 printf(
"findpts_el_2 Newton step (%u), %u unconverged:\n ", step,nconv);
762 for(i=0;i<9;++
i) printf(
" %u",count[i]);
767 for(p=pstart,pout=pbuf; p!=pe; p+=pn,pout+=pn) {
774 unsigned offset[10] = { 0,0,0, 0,0,0, 0,0,0, 0 };
776 for(pout=pbuf; pout!=pe; ++pout)
779 unsigned i;
unsigned sum=0;
781 unsigned ci=offset[
i]; count[
i]=ci, offset[
i]=
sum, sum+=ci;
783 nconv = offset[9] =
sum;
785 for(pout=pbuf; pout!=pe; ++pout)
790 for(p=pstart;p!=pe;++
p)
793 #if DIAGNOSTICS_ITERATIONS
794 printf(
"findpts_el_2 took %u steps\n ", step);
799 double *
const out_base,
const unsigned out_stride,
800 const double *
const r_base,
const unsigned r_stride,
const unsigned pn,
803 const unsigned nr=fd->
n[0],
ns=fd->
n[1];
804 double *
const wtr = fd->
work, *
const wts = wtr+nr*pn,
805 *
const slice = wts+
ns*pn;
806 unsigned i;
const double *
r;
double *
out;
807 for(i=0,r=r_base;i<pn;++
i) {
810 r = (
const double*)((
const char*)r + r_stride);
814 for(i=0,out=out_base;i<pn;++
i) {
815 const double *
const wtr_i = wtr+i*
nr, *
const slice_i = slice+i*
nr;
817 out = (
double*)((
char*)out + out_stride);
struct findpts_el_gpt_2 pt[4]
static double sum(struct xxt *data, double v, uint n, uint tag)
#define tmalloc(type, count)
#define findpts_el_free_2
void lagrange_fun(double *restrict p, double *restrict data, unsigned n, int d, double x)
static double tensor_ig1(double g[1], const double *wtr, uint nr, const double *u)
void compute_edge_data_fun(struct findpts_el_data_2 *fd)
void findpt_fun(struct findpts_el_pt_2 *const out, struct findpts_el_data_2 *const fd, const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol)
static void tensor_mtxv(double *y, uint ny, const double *A, const double *x, uint nx)
static void lin_solve_2(double x[2], const double A[4], const double y[2])
static const struct findpts_el_gpt_2 * get_pt(struct findpts_el_data_2 *fd, unsigned pi)
static void tensor_mxm(double *C, uint nc, const double *A, uint na, const double *B, uint nb)
static unsigned pt_flags_to_bin_noC(const unsigned flags)
static void newton_area(struct findpts_el_pt_2 *const out, const double jac[4], const double resid[2], const struct findpts_el_pt_2 *const p, const double tol)
static unsigned plus_1_mod_2(const unsigned x)
static unsigned point_index(const unsigned x)
static double tensor_i1(const double *Jr, uint nr, const double *u)
static const struct findpts_el_gedge_2 * get_edge(struct findpts_el_data_2 *fd, unsigned ei)
static unsigned work_size(const unsigned nr, const unsigned ns, const unsigned npt_max)
static unsigned num_constrained(const unsigned flags)
#define findpts_el_setup_2
static void seed(struct findpts_el_data_2 *const fd, struct findpts_el_pt_2 *const pt, const unsigned npt)
static unsigned pt_flags_to_bin(const unsigned flags)
static void compute_edge_data_s(struct findpts_el_data_2 *fd)
static void newton_edge(struct findpts_el_pt_2 *const out, const double jac[4], const double rhes, const double resid[2], const unsigned de, const unsigned dn, unsigned flags, const struct findpts_el_pt_2 *const p, const double tol)
static unsigned edge_index(const unsigned x)
static void compute_pt_data(struct findpts_el_data_2 *fd)
static void findpt_pt(struct findpts_el_pt_2 *const out, struct findpts_el_data_2 *const fd, const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol)
static unsigned which_bit(const unsigned x)
struct findpts_el_pt_2 * p
static void findpt_edge(struct findpts_el_pt_2 *const out, struct findpts_el_data_2 *const fd, const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol)
static int reject_prior_step_q(struct findpts_el_pt_2 *const out, const double resid[2], const struct findpts_el_pt_2 *const p, const double tol)
static const unsigned nr[3]
#define findpts_el_eval_2
static double work[TNR *NS]
establishes some macros to establish naming conventions
static void compute_edge_data_r(struct findpts_el_data_2 *fd)
static void findpt_area(struct findpts_el_pt_2 *const out, struct findpts_el_data_2 *const fd, const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol)
static double y[NR *NS *NT *N]
struct findpts_el_gedge_2 edge[4]