#ifndef SBCLIB_PARALLEL
// Simple parallelisation library for 'for' loops
//#define PARALLEL_TIMES // Records start and end times with parallel1

#include <stdlib.h> // malloc, free
#if defined(__linux__) || defined(__APPLE__)
#ifdef __APPLE__
#include <pthread.h> // This pthreads version apparently works everywhere
#else // Linux
#include <threads.h>
#include <sys/sysinfo.h> // glibc-only, so kept out of the macOS branch
#endif
#include <unistd.h> // sysconf
#else // Windows
#include <windows.h>
#include <rnd/Maxu.c>
#endif
// Per-thread work descriptor passed to the function that parallel() runs
typedef struct
{
	int n0,n1;  // This thread's share of the index range: n0 up to but not including n1
	int thread; // Index of this thread's slice (0 .. threads-1)
	void *args; // Caller-supplied pointer, the same for every thread
	void *out;  // This thread's slot in the output buffer returned by parallel()
} ParallelInfo;

// Returns the default number of threads to use: the number of processors
// reported by the operating system, never less than 1. The value is looked up
// on the first call and cached in a function-local static afterwards.
unsigned parallel_threads(void)
{
	static unsigned cores=0;
	if (!cores)
	{
		#if defined(__linux__) || defined(__APPLE__)
		// sysconf() exists on both Linux and macOS, whereas get_nprocs() and
		// <sys/sysinfo.h> are glibc-only and are not available on macOS.
		const long online=sysconf(_SC_NPROCESSORS_ONLN);
		cores=online>0 ? (unsigned)online : 1; // sysconf() returns -1 on failure
		#else // Windows
		SYSTEM_INFO si; GetSystemInfo(&si);
		cores=Maxu(1,si.dwNumberOfProcessors);
		#endif
	}
	return cores;
}

void *parallel(unsigned f(ParallelInfo *),const int n0,const int n1,
	void *args=NULL,const unsigned output_sz=0,unsigned threads=0,const int async=0)
{ // If output_sz>0, returns an array of 'threads' structures of size 'output_sz' bytes each
	int i; unsigned long mallards; if (!threads) threads=parallel_threads();
	#if defined(__linux__)
	thrd_t *h=(thrd_t *)malloc(threads*sizeof(thrd_t));
	#elif defined(__APPLE__)
	pthread_t *h=(pthread_t *)malloc(threads*sizeof(pthread_t));
	#else // Windows
	HANDLE *h=(HANDLE *)malloc(threads*sizeof(HANDLE));
	#endif
	void *ret=NULL; if (output_sz) ret=malloc(threads*output_sz);
	ParallelInfo *a=(ParallelInfo *)malloc(threads*sizeof(ParallelInfo));
	for (i=threads-1;i>=0;i--)
	{
		a[i].n0=n0+(int)((double)(n1-n0)*i/threads);
		if (i<threads-1) a[i].n1=a[i+1].n0; else a[i].n1=n1;
		a[i].thread=i; a[i].args=args; a[i].out=(unsigned char *)ret+i*output_sz;
		if (i || async)
		{
			#if defined(__linux__)
			thrd_create(&h[i],(thrd_start_t)f,a+i);
			#elif defined(__APPLE__)
			pthread_create(&h[i],NULL,(void *(*)(void *))f,a+i);
			#else // Windows
			h[i]=CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)f,a+i,0,&mallards); // These mallards are not used
			#endif
		}
		else f(a); // Run in thread 0
	}
	if (threads>1 && !async)
	{
		#if defined(__linux__)
		for (i=threads-1;i;i--) thrd_join(h[i],NULL);
		#elif defined(__APPLE__)
		for (i=threads-1;i;i--) pthread_join(h[i],NULL);
		#else // Windows
		WaitForMultipleObjects(threads-1,h+1,TRUE,INFINITE);
		#endif
	}
	if (!async) // Leaky /**/ (I think async=1 is leaky because h and a aren't freed?)
	{
		#if defined(__linux__) || defined(__APPLE__)
		// Nothing
		#else // Windows
		if (async) CloseHandle(h[0]);
		for (i=threads-1;i;i--) CloseHandle(h[i]);
		#endif
		free(h); free(a);
	}
	return ret;
}

#include <float.h>

// Call this at the start of a thread function to de-mank the FPU: on 32-bit x86
// Windows it sets the x87 precision control to _PC_64. On every other target it
// does nothing.
void parallel_init_thread(void)
{
	#if !defined(__linux__) && !defined(__APPLE__) // Windows
	// Precision control is an x87 setting. Microsoft documents changing it via
	// _MCW_PC as unsupported on x64 (it invokes the invalid-parameter handler),
	// so only touch it on 32-bit x86 builds.
	#if defined(_M_IX86) || defined(__i386__)
	_controlfp(_PC_64,_MCW_PC);
	#endif
	#endif
}

// Routines for filling a cuboidal array of function values
#ifdef SBCLIB_ARRAY_2D
// Arguments shared by all worker threads of array_2d_fill()
typedef struct
{
	REAL **a;                                  // Array being filled, indexed a[x][y]
	void (*f)(REAL,REAL,REAL *,const void *);  // Called as f(x0+x*dx, y0+y*dy, &a[x][y], fargs)
	const void *fargs;                         // Opaque pointer handed to f unchanged
	REAL x0,y0;                                // Coordinates passed to f for cell [0][0]
	REAL dx,dy;                                // Coordinate step per index in x and y
} array_2d_fill_args;

// Worker run by parallel() on behalf of array_2d_fill(). pa->args points to an
// array_2d_fill_args; the cells numbered pa->n0 up to (not including) pa->n1 are
// filled, where cell n is a[n/Y][n%Y] and Y is array_2d_ys(a). Always returns 0.
unsigned array_2d_fill_thread(ParallelInfo *pa)
{
	parallel_init_thread();
	const array_2d_fill_args *args=(const array_2d_fill_args *)pa->args;
	REAL **a=args->a;
	const REAL x0=args->x0,y0=args->y0,dx=args->dx,dy=args->dy;
	void (*f)(REAL,REAL,REAL *,const void *)=args->f;
	const void *fargs=args->fargs;
	const int Y=array_2d_ys(a); // Only the y size is needed to turn n back into (x,y)
	int n;
	for (n=pa->n0;n<pa->n1;n++)
	{
		const int x=n/Y,y=n%Y;
		f(x0+x*dx,y0+y*dy,&a[x][y],fargs);
	}
	return 0;
}

// Fills every cell of the 2D array 'a' by calling f(x0+x*dx, y0+y*dy, &a[x][y], fargs),
// with the cells shared out between threads by parallel().
void array_2d_fill(REAL **a,void f(REAL,REAL,REAL *,const void *),const void *fargs=NULL,
	const REAL x0=0,const REAL y0=0,const REAL dx=1,const REAL dy=1)
{
	array_2d_fill_args job;
	// The array and the function to evaluate
	job.a=a;
	job.f=f;
	job.fargs=fargs;
	// Origin and spacing of the coordinate grid
	job.x0=x0;
	job.y0=y0;
	job.dx=dx;
	job.dy=dy;
	// One work item per cell
	parallel(array_2d_fill_thread,0,array_2d_xs(a)*array_2d_ys(a),&job);
}
#endif

#ifdef SBCLIB_ARRAY_3D
// Arguments shared by all worker threads of array_3d_fill()
typedef struct
{
	REAL ***a;                                      // Array being filled, indexed a[x][y][z]
	void (*f)(REAL,REAL,REAL,REAL *,const void *);  // Called as f(x0+x*dx, y0+y*dy, z0+z*dz, &a[x][y][z], fargs)
	const void *fargs;                              // Opaque pointer handed to f unchanged
	REAL x0,y0,z0;                                  // Coordinates passed to f for cell [0][0][0]
	REAL dx,dy,dz;                                  // Coordinate step per index in x, y and z
} array_3d_fill_args;

// Worker run by parallel() on behalf of array_3d_fill(). pa->args points to an
// array_3d_fill_args; the cells numbered pa->n0 up to (not including) pa->n1 are
// filled, where cell n is a[n/(Y*Z)][(n/Z)%Y][n%Z] with Y=array_3d_ys(a) and
// Z=array_3d_zs(a). Always returns 0.
unsigned array_3d_fill_thread(ParallelInfo *pa)
{
	parallel_init_thread();
	const array_3d_fill_args *args=(const array_3d_fill_args *)pa->args;
	REAL ***a=args->a;
	const REAL x0=args->x0,y0=args->y0,z0=args->z0;
	const REAL dx=args->dx,dy=args->dy,dz=args->dz;
	void (*f)(REAL,REAL,REAL,REAL *,const void *)=args->f;
	const void *fargs=args->fargs;
	const int Y=array_3d_ys(a),Z=array_3d_zs(a); // The x size is not needed to decode n
	int n;
	for (n=pa->n0;n<pa->n1;n++)
	{
		const int x=n/(Y*Z),y=(n/Z)%Y,z=n%Z;
		f(x0+x*dx,y0+y*dy,z0+z*dz,&a[x][y][z],fargs);
	}
	return 0;
}

// Fills every cell of the 3D array 'a' by calling
// f(x0+x*dx, y0+y*dy, z0+z*dz, &a[x][y][z], fargs), with the cells shared out
// between threads by parallel().
void array_3d_fill(REAL ***a,void f(REAL,REAL,REAL,REAL *,const void *),const void *fargs=NULL,
	const REAL x0=0,const REAL y0=0,const REAL z0=0,const REAL dx=1,const REAL dy=1,const REAL dz=1)
{
	array_3d_fill_args job;
	// The array and the function to evaluate
	job.a=a;
	job.f=f;
	job.fargs=fargs;
	// Origin and spacing of the coordinate grid
	job.x0=x0;
	job.y0=y0;
	job.z0=z0;
	job.dx=dx;
	job.dy=dy;
	job.dz=dz;
	// One work item per cell
	parallel(array_3d_fill_thread,0,array_3d_xs(a)*array_3d_ys(a)*array_3d_zs(a),&job);
}

// Arguments shared by all worker threads of array_3d_type_fill()
typedef struct
{
	void ***a;                                      // Array being filled; a[x][y] is the base of a row of elements
	int sz;                                         // Size in bytes of one element (element z is at byte offset z*sz)
	void (*f)(REAL,REAL,REAL,void *,const void *);  // Called as f(x0+x*dx, y0+y*dy, z0+z*dz, element, fargs)
	const void *fargs;                              // Opaque pointer handed to f unchanged
	REAL x0,y0,z0;                                  // Coordinates passed to f for element [0][0][0]
	REAL dx,dy,dz;                                  // Coordinate step per index in x, y and z
} array_3d_type_fill_args;

// Worker run by parallel() on behalf of array_3d_type_fill(). pa->args points to
// an array_3d_type_fill_args; the elements numbered pa->n0 up to (not including)
// pa->n1 are filled, where element n has indices x=n/(Y*Z), y=(n/Z)%Y, z=n%Z and
// lives at byte offset z*sz from a[x][y]. Always returns 0.
unsigned array_3d_type_fill_thread(ParallelInfo *pa)
{
	parallel_init_thread();
	const array_3d_type_fill_args *args=(const array_3d_type_fill_args *)pa->args;
	void ***a=args->a;
	const int sz=args->sz;
	const REAL x0=args->x0,y0=args->y0,z0=args->z0;
	const REAL dx=args->dx,dy=args->dy,dz=args->dz;
	void (*f)(REAL,REAL,REAL,void *,const void *)=args->f;
	const void *fargs=args->fargs;
	const int Y=array_3d_ys(a),Z=array_3d_zs(a); // The x size is not needed to decode n
	int n;
	for (n=pa->n0;n<pa->n1;n++)
	{
		const int x=n/(Y*Z),y=(n/Z)%Y,z=n%Z;
		// Elements have a caller-defined size, so step through the row in bytes
		f(x0+x*dx,y0+y*dy,z0+z*dz,(unsigned char *)a[x][y]+z*sz,fargs);
	}
	return 0;
}

void array_3d_type_fill(void ***a,const int sz,void f(REAL,REAL,REAL,void *,const void *),const void *fargs=NULL,
	const REAL x0=0,const REAL y0=0,const REAL z0=0,const REAL dx=1,const REAL dy=1,const REAL dz=1)
{
	array_3d_type_fill_args args; args.a=a; args.sz=sz; args.f=f; args.fargs=fargs;
	args.x0=x0; args.y0=y0; args.z0=z0; args.dx=dx; args.dy=dy; args.dz=dz;
	parallel(array_3d_type_fill_thread,0,array_3d_xs(a)*array_3d_ys(a)*array_3d_zs(a),&args);
}
#endif

#define SBCLIB_PARALLEL
#endif
