#ifdef UTIL
#include "GPU_Util.cl" //This only works if you also add the include path in a -I flag during compile
#endif
/*#ifndef TYPE
#error TYPE not defined! //You can display an error message during compile time.
#endif*/
//#define VAR 2 //This isn't valid here, use constant int, OR the -D input flag, or we can add our own find/replace step before compiling.
//constant int FOUR = 2*2;//Arithmetic can be computed in constants just fine.
//Examples: https://github.com/vpa1977/streamgpu/tree/master/gpu_knn/src/org/stream_gpu/knn

// Kernel with a deliberately varied parameter list (vector, private, local,
// global, long and float arguments). Only destData is touched by the body:
// each work-item stores (int)pythag(4.0, 3.0) at its own global index.
// NOTE(review): pythag is not defined in this file; presumably it comes from
// GPU_Util.cl, which is only included when UTIL is defined -- confirm.
__kernel void   testparams(const   short3 W,  private short   H,   local char * srcData, global int *destData, long length, local float* points, float precise) {
	const int gid = get_global_id(0); // this work-item's slot in destData

	// Evaluate the helper first, then publish the truncated result.
	const int result = (int)pythag(4.0, 3.0);
	destData[gid] = result;
}
/*----------------------------------------------------------*//**

*//*-----------------------------------------------------------*/
// Each work-item writes (int)pythag(4.0, 3.0) into destData at its global
// index. W, H and srcData are accepted but not read by the body.
// NOTE(review): pythag is not defined in this file; presumably it comes from
// GPU_Util.cl, which is only included when UTIL is defined -- confirm.
__kernel void   benchBarrier  (const   int W,  const int   H,   __global int* srcData, __global int* destData) {
	const int gid = get_global_id(0); // index of this work-item in the global range

	const int value = (int)pythag(4.0, 3.0);
	destData[gid] = value;
}
/*----------------------------------------------------------*//**

*//*-----------------------------------------------------------*/
// Adds the scalar `amount` to one element of `data` per work-item, in place.
// The element is selected by the work-item's global id in dimension 0.
__kernel void scalarAdd(int amount, __global int* data) {
	const int gid = get_global_id(0);

	// Read-modify-write of this work-item's own element.
	data[gid] = data[gid] + amount;
}
/*----------------------------------------------------------*//**

*//*-----------------------------------------------------------*/
// Multiplies one element of `data` per work-item by the scalar `amount`,
// in place. The element is selected by the global id in dimension 0.
__kernel void benchMult(private const int amount, __global int* data) {
	const int gid = get_global_id(0);

	const int scaled = data[gid] * amount;
	data[gid] = scaled;
}
/*----------------------------------------------------------*//**
Add srcData onto destData (element-wise, in place).
*//*-----------------------------------------------------------*/
__kernel void add(__global int* destData, __global int* srcData) {
	// One work-item per element: the global id selects the element that is
	// read from srcData and accumulated into destData.
	const int gid = get_global_id(0);

	destData[gid] = destData[gid] + srcData[gid];
}
/*----------------------------------------------------------*//**
Store the element-wise sum srcData1 + srcData2 into destData.
*//*-----------------------------------------------------------*/
__kernel void addArrays(__global float* srcData1, __global float* srcData2, __global float* destData) {
	// One work-item per element, selected by the global id in dimension 0.
	const int gid = get_global_id(0);

	// Load both operands before writing the result.
	const float lhs = srcData1[gid];
	const float rhs = srcData2[gid];
	destData[gid] = lhs + rhs;
}
/*----------------------------------------------------------*//**
NOTE: When using __local as an input arg, it MUST be set as NULL (setNullArg) and its size must be set from the CPU,
which won't know the size returned by get_local_size() a priori. Local memory is faster than global memory, which would
be useful in situations solving sub-matrices of a larger matrix, for example.
*//*-----------------------------------------------------------*/
__kernel void benchLocal(__global int* dataGlobal, __local int* dataLocal) {
	// Resolve both indices up front: position within the work-group and
	// position within the whole global range.
	const int lid = get_local_id(0);
	const int gid = get_global_id(0);

	// Stash the work-group size in this work-item's local slot ...
	dataLocal[lid] = get_local_size(0);

	// ... then copy that same slot out to global memory. Each work-item only
	// reads the slot it wrote itself.
	dataGlobal[gid] = dataLocal[lid];
}

/*----------------------------------------------------------*//**
Sum all the elements of a *large* vector (the overhead costs will make this
slower on small vectors!). Note that local allocation sizes must be specified in BYTES!!!
More details: https://dournac.org/info/gpu_sum_reduction
*//*-----------------------------------------------------------*/
__kernel void reduceSum(__global const int *input,
						__global int *partialSums,
						__local  int *localSums) {
	// input       : one int per work-item, read at the work-item's global id.
	// partialSums : receives one sum per work-group, at index get_group_id(0).
	// localSums   : scratch buffer indexed by local id; it must hold at least
	//               get_local_size(0) ints.
	const int local_id   = get_local_id(0);
	const int group_size = get_local_size(0);

	// Stage this work-item's element in local memory.
	localSums[local_id] = input[get_global_id(0)];

	// Pairwise reduction over the first `active` slots. Each pass folds the
	// upper part [half, active) onto the lower part [0, active - half), so
	// every element is counted exactly once even when the work-group size is
	// not a power of two (a plain size/2 halving would drop the odd element).
	// The loop bounds depend only on group_size, so every work-item in the
	// group executes the same number of barriers.
	for (int active = group_size; active > 1; active = (active + 1) / 2) {
		const int half = (active + 1) / 2; // ceil(active / 2)

		// Make the previous pass's writes (or the initial copy) visible.
		barrier(CLK_LOCAL_MEM_FENCE);

		// Reads come from slots >= half and writes go to slots < half, so
		// work-items never race within a single pass.
		if (local_id + half < active) {
			localSums[local_id] += localSums[local_id + half];
		}
	}

	// Work-item 0 performed the last write to slot 0 itself, so it can publish
	// the group's total without a further barrier.
	if (local_id == 0) {
		partialSums[get_group_id(0)] = localSums[0];
	}
}
