#ifndef SBCLIB_SVD
// Singular-value decomposition of a possibly-rectangular matrix A = UDV where U,V are orthogonal
// Would be faster with shifting/"Wilkinson's Shift" but that requires a more explicit version of bidiagonal_sweep
#include <squarematrix.c>
#include <rnd/sq.c>

void hhmulleft(REAL **a,const REAL *u,const int start=0) // Optionally "start" is u's first nonzero value
{ // Replaces A by (I-2uu^t/|u|^2)A = A - u(2u^tA)/|u|^2, O(N^2 or XY)
	int x,y,X=array_2d_xs(a),Y=array_2d_ys(a); REAL t,uu=0;
	for (x=X-1;x>=start;x--) uu+=u[x]*u[x]; if (uu==0) return;
	for (y=Y-1;y>=0;y--)
	{
		t=0;
		for (x=X-1;x>=start;x--) t+=u[x]*a[x][y];
		t*=2.0/uu;
		for (x=X-1;x>=start;x--) a[x][y]-=u[x]*t;
	}
}

void hhmulright(REAL **a,const REAL *v,const int start=0) // Optionally "start" is v's first nonzero value
{ // Replaces A by A(I-2vv^t/|v|^2) = A - (2Av)v^t/|v|^2, O(N^2 or XY)
	int x,y,X=array_2d_xs(a),Y=array_2d_ys(a); REAL t,vv=0;
	for (y=Y-1;y>=start;y--) vv+=v[y]*v[y]; if (vv==0) return;
	for (x=X-1;x>=0;x--)
	{
		t=0;
		for (y=Y-1;y>=start;y--) t+=a[x][y]*v[y];
		t*=2.0/vv;
		for (y=Y-1;y>=start;y--) a[x][y]-=t*v[y];
	}
}

void hhstepleft(REAL **a,const int sx,const int sy,REAL ** const b=NULL)
{
	int x,X=array_2d_xs(a);
	if (sx>=X) return; // You need to check sy<Y when calling though
	REAL d=0,x1=a[sx][sy];
	for (x=sx;x<X;x++) d+=sq(a[x][sy]);
	d=sqrt(d);
	if (x1==d) return; // Already pointing in the right direction, nothing to do!
	REAL *h=array_1d(X);
	for (x=0;x<sx;x++) h[x]=0;
	h[sx]=x1-d;
	for (x=sx+1;x<X;x++) h[x]=a[x][sy];
	hhmulleft(a,h,sx);
	if (b) hhmulright(b,h,sx);
	array_1d_free(h);
}

void hhstepright(REAL **a,const int sx,const int sy,REAL ** const b=NULL)
{
	int y,Y=array_2d_ys(a);
	if (sy>=Y) return; // You need to check sx<X when calling though
	REAL d=0,x1=a[sx][sy];
	for (y=sy;y<Y;y++) d+=sq(a[sx][y]);
	d=sqrt(d);
	if (x1==d) return;
	REAL *h=array_1d(Y);
	for (y=0;y<sy;y++) h[y]=0;
	h[sy]=x1-d;
	for (y=sy+1;y<Y;y++) h[y]=a[sx][y];
	hhmulright(a,h,sy);
	if (b) hhmulleft(b,h,sy);
	array_1d_free(h);
}

int compactsvd=1; // This might as well be on in almost all applications

void bidiagonalise(REAL **a,REAL ***pu,REAL ***pv)
{ // O(N^3)
	int X=array_2d_xs(a),Y=array_2d_ys(a),M=Mini(X,Y);
	if (compactsvd)
	{ // Only create the smaller orthogonal matrix
		if (X<Y) {*pu=matrixid(X); *pv=NULL;} // Actually linked to logic in bidiagonal_sweep
		else {*pu=NULL; *pv=matrixid(Y);}
	}
	else {*pu=matrixid(X); *pv=matrixid(Y);}
	for (int n=0;n<M;n++)
	{
		hhstepleft(a,n,n,*pu);
		hhstepright(a,n,n+1,*pv);
	}
}

void givensrot(const REAL a,const REAL b,REAL *pc,REAL *ps,REAL *pr)
{ // Demmel & Kahan algorithm for a matrix [c s][-s c] that rotates (a,b) to (r,0)
	if (a==0) {*pc=0; *ps=1; *pr=b; return;}
	REAL t,t1;
	if (fabs(a)>=fabs(b))
	{
		t=b/a; t1=sqrt(t*t+1);
		*pc=1.0/t1; *ps=t/t1; *pr=a*t1;
	}
	else
	{
		t=a/b; t1=sqrt(t*t+1);
		*ps=1.0/t1; *pc=t/t1; *pr=b*t1;
	}
}

void givensmulleft(REAL **a,const int i,const int j,const REAL cth,const REAL sth)
{ // A -> GA where G is [cth -sth][sth cth] in rows/cols i,j
	int n,Y=array_2d_ys(a); REAL ain;
	for (n=Y-1;n>=0;n--)
	{
		ain=a[i][n];
		a[i][n]=cth*ain-sth*a[j][n];
		a[j][n]=sth*ain+cth*a[j][n];
	}
}

void givensmulright(REAL **a,const int i,const int j,const REAL cth,const REAL sth)
{ // A -> AG where G is [cth -sth][sth cth] in rows/cols i,j
	int n,X=array_2d_xs(a); REAL ani;
	for (n=X-1;n>=0;n--)
	{
		ani=a[n][i];
		a[n][i]=ani*cth+a[n][j]*sth;
		a[n][j]=ani*-sth+a[n][j]*cth;
	}
}

void bidiagonal_sweep(REAL *d,REAL *e,REAL **u,REAL **v)
{ // O(N^2) step
	int n,m=array_1d_m(d);
	REAL os,oc=1,c=1,s,r;
	for (n=0;n+1<m;n++)
	{
		givensrot(c*d[n],e[n],&c,&s,&r);
		if (v) givensmulleft(v,n,n+1,c,-s);
		if (n>0) e[n-1]=r*os;
		givensrot(oc*r,d[n+1]*s,&oc,&os,&d[n]);
		if (u) givensmulright(u,n,n+1,oc,os);
	}
	if (compactsvd && u || u && v && array_2d_xs(v)>array_2d_xs(u))
	{
		givensrot(c*d[m-1],e[m-1],&c,&s,&r);
		if (v) givensmulleft(v,m-1,m,c,-s);
		e[m-2]=r*os;
		d[m-1]=r*oc;
		e[m-1]=0;
	}
	else
	{
		e[m-2]=c*d[m-1]*os;
		d[m-1]=c*d[m-1]*oc;
	}
}

void bidiagonal_vectors(const REAL * const *a,REAL **pd,REAL **pe)
{ // Extracts the diagonal and superdiagonal as vectors
	int n,m=Mini(array_2d_xs(a),array_2d_ys(a));
	*pd=array_1d(m); *pe=array_1d(m);
	for (n=m-1;n>=0;n--) (*pd)[n]=a[n][n];
	for (n=m-2;n>=0;n--) (*pe)[n]=a[n][n+1];
	if (m<array_2d_ys(a)) (*pe)[m-1]=a[m-1][m];
	else (*pe)[m-1]=0;
}
//#include <profile1.c>
REAL *svd(const REAL * const *a0,REAL ***pu,REAL ***pv)
{ // Returns the "diagonal" (vector of singular values), also constructs U, V matrices
	REAL **a=array_2d_copy(a0);
//profile("svd - bidiagonalise");
	bidiagonalise(a,pu,pv);
	REAL *ret,*e;
//profile("svd - bidiagonal_vectors");
	bidiagonal_vectors(a,&ret,&e);
	array_2d_free(a);
//profile("svd - bidiagonal_sweep iterations");
	int i; for (i=0;array_1d_supnorm(e)>1e-15 && i<1000;i++) bidiagonal_sweep(ret,e,*pu,*pv);
	array_1d_free(e);
	if (compactsvd)
	{ // Find part of the larger orthogonal matrix implicity from the rest
//profile("svd - compactsvd stuff");
		int n,i,j,X=array_2d_xs(a0),Y=array_2d_ys(a0); REAL t;
		if (!*pu) // A=USV => U=AV^tS^-1
		{
			*pu=array_2d(X,Y);
			for (i=X-1;i>=0;i--) for (j=Y-1;j>=0;j--) if (ret[j]!=0)
			{
				t=0;
				for (n=Y-1;n>=0;n--) t+=a0[i][n]*(*pv)[j][n];
				(*pu)[i][j]=t/ret[j];
			} else (*pu)[i][j]=0;
		}
		else if (!*pv) // A=USV => V=S^-1U^tA
		{
			*pv=array_2d(X,Y);
			for (i=X-1;i>=0;i--) for (j=Y-1;j>=0;j--) if (ret[i]!=0)
			{
				t=0;
				for (n=X-1;n>=0;n--) t+=(*pu)[n][i]*a0[n][j];
				(*pv)[i][j]=t/ret[i];
			} else (*pv)[i][j]=0;
		}
	}
	return ret;
}

void svdfree(REAL *s,REAL **u,REAL **v) {array_1d_free(s); array_2d_free(u); array_2d_free(v);}

REAL *svdsym(const REAL * const *a0,REAL ***pu,REAL ***pv)
{ // Makes U and V matrices be transposes of each other and fixes up signs in eigenvalues (of real symmetric a0)
	REAL *ret=svd(a0,pu,pv);
	int n,i,m=array_1d_m(ret);
	REAL **u=*pu,**v=*pv,t;
	for (n=m-1;n>=0;n--)
	{
		t=0;
		for (i=m-1;i>=0;i--) t+=u[i][n]*v[n][i];
		if (t<0) // Different signs of U column and V row
		{
			ret[n]*=-1;
			for (i=m-1;i>=0;i--) u[i][n]*=-1;
		}
	}
	return ret;
}

REAL *svdsolve_inner(const REAL *s,const REAL * const *u,const REAL * const *v,const REAL *b,const REAL lambda=0)
{ // If you've already calculated the SVD, use this O(N^2) routine
	REAL ll=lambda*lambda,*utb=matrixtmulvector(u,b);
	int n,m=array_2d_xs(v),um=array_1d_m(utb); REAL *isutb=array_1d(m);
	for (n=m-1;n>=0;n--)
		if (n>=um) isutb[n]=0;
		else if (ll>0 || s[n]!=0) isutb[n]=s[n]*utb[n]/(s[n]*s[n]+ll);
		else isutb[n]=0;
	array_1d_free(utb);
	REAL *x=matrixtmulvector(v,isutb);
	array_1d_free(isutb);
	return x;
}

REAL *svdsolve(const REAL * const *a,const REAL *b,const REAL lambda=0)
{ // Ax~=b; lambda->0 for exact solution, lambda->infinity for tiny steepest descent step
	REAL **u,**v,*s=svd(a,&u,&v),
		*ret=svdsolve_inner(s,u,v,b,lambda);
	array_1d_free(s); array_2d_free(u); array_2d_free(v);
	return ret;
}

REAL *svdquadmax_inner(const REAL *s,const REAL * const *u,const REAL * const *v,const REAL *g,const REAL lambda=0)
{ // If you've already calculated the SVD (with svdsym), use this O(N^2) routine
	REAL *utg=matrixtmulvector(u,g);
	int n,m=array_1d_m(s); REAL maxeig=0,denom;
	for (n=m-1;n>=0;n--) if (s[n]>maxeig) maxeig=s[n];
	for (n=m-1;n>=0;n--)
	{
		denom=maxeig+lambda-s[n];
		if (denom>0) utg[n]/=denom; else utg[n]=0;
	}
	REAL *x=matrixtmulvector(v,utg);
	array_1d_free(utg);
	return x;
}

REAL *svdquadmax(const REAL *g,const REAL * const *h,const REAL lambda=0)
{ // f(x)=f(0)+g.x+(1/2)x^T.H.x, find arg max_x f(x) for |x|<=r(lambda)
	REAL **u,**v,*s=svdsym(h,&u,&v),
		*ret=svdquadmax_inner(s,u,v,g,lambda);
	array_1d_free(s); array_2d_free(u); array_2d_free(v);
	return ret;
}

#define SBCLIB_SVD
#endif
