-
Notifications
You must be signed in to change notification settings - Fork 1
/
cuda_define.m
180 lines (167 loc) · 11.7 KB
/
cuda_define.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
% cuda_define(FktName, FktType, Comment, CoreCommandsReal, CoreCommandsCpxReal, CoreCommandsRealCpx, CoreCommandsCpxCpx) : defines a function into the system of cuda commands that can be executed on NVidia Graphic cards
% FktName : Name of the function
% FktType :
% 'CUDA_UnaryFkt': A cuda function that takes one array (a) as input and has one result array
% 'CUDA_UnaryFktConst': Takes one array (a) and an additional scalar real value (b) as input and has one result array.
% 'CUDA_UnaryFktConstC': Takes one array (a) and an additional complex value (real part:br complex part: bi) as input and has one result array.
% 'CUDA_BinaryFkt' : Takes two arrays (a and b) as input and has one (c) as output
% 'CUDA_3DAsgFkt' : Takes one scalar real or complex (br and bi) and has out output array (c).
%
% CoreCommandsReal : C/cuda - commands that are executed for every computing core for real valued input
% CoreCommandsRealCpx : C/cuda - commands that are executed for every computing core for complex valued input (one input) or a complex and real array
% CoreCommandsCpxReal : C/cuda - commands that are executed for every computing core for complex valued input
% CoreCommandsCpxCpx : C/cuda - commands that are executed for every computing core for complex valued input
%
% the following variables are available:
% a : first input array
% b : second input array (binary only)
%
% c : output array
% idx : index running through the array
% idd : index in the destination array (only for functions with subarray indexing)
% x,y,z : pixel coordinates (only for FuntionType CUDA_3DFkt)
%
% Attention: The C-code must not contain any komma, as this upsets the macros with arguments and will thus not compile!
%
% all defined function text is saved in a global variable "cuda_to_compile" which can be cleared by
% cuda_to_compile=[];
%
% see also: cuda_compile_all
%
% example :
%
% cuda_define('myadd','CUDA_BinaryFkt','Adds two arrays together.','c[idx]=a[idx]+b[idx];','c[2*idx]=a[2*idx]+b[idx];c[2*idx+1]=a[2*idx+1];','c[2*idx]=a[idx]+b[2*idx];c[2*idx+1]=b[2*idx+1];','c[2*idx]=a[2*idx]+b[2*idx];c[2*idx+1]=a[2*idx+1]+b[2*idx+1];');
% cuda_compile_all();
% mytst=myadd(readim,readim('orka'))
% whos
%
% see also : the code of the appleman program for argument appleman(2)
function cuda_define(FktName, FktType, Comment, CoreCommandsReal, CoreCommandsCpxReal, CoreCommandsRealCpx, CoreCommandsCpxCpx, NArgs)
% 'CUDA_MaskIdx': Takes one array and a mask index vector as input. The latter can be used to modify the result inside the mask
% 'CUDA_PartRedMask' : Takes one array (a) as input and reduces it to a scalar
% 'CUDA_PartRedMaskCpx' : Takes a complex array (a) as input and reduces it to a complex scalar
% 'CUDA_PartRedMaskIdx' : Takes an array (a) and finds an index
%CUDA_PartRedMask(psum_arr,mysum)
%CUDA_PartRedMaskCpx(psum_carr,mysum)
%CUDA_PartRedMaskIdx(pmax_arr,maxCond)
%CUDA_3DAsgFkt(const_subcpy_arr,c[idd]=br;)
% CUDA_3DFkt(arr_subcpy_arr,c[idd]=a[ids];)
% CUDA_BinaryFkt(arr_power_arr,c[idx]=pow(a[idx],b[idx]);)
% CUDA_MaskIdx(arr_subsref_arr,c[mask_idx]=a[idx];)
% CUDA_UnaryFktConstC(arr_unequals_Cconst,c[idx]=(a[idx]!=br) || (bi!=0);)
% CUDA_UnaryFktConstC(Cconst_unequals_arr,c[idx]=(br!=a[idx]) || (bi!=0);)
% CUDA_BinaryFkt(arr_unequals_arr,c[idx]=(a[idx]!=b[idx]);)
if nargin < 5 || isempty(CoreCommandsCpxReal)
CoreCommandsCpxReal='';
end
if nargin < 6 || isempty(CoreCommandsRealCpx)
CoreCommandsRealCpx='';
end
if nargin < 7 || isempty(CoreCommandsCpxCpx)
CoreCommandsCpxCpx='';
end
if nargin < 8 || isempty(NArgs)
NArgs = 1;
end
switch(FktType)
case 'CUDA_UnaryFkt' % A cuda function that takes one array (a) as input and has one result array.
m_snippet = sprintf('%% out = %s(in1) : User defined cuda function. %s\n',FktName,Comment);
m_snippet = sprintf('%s function out = %s(in1)\n',m_snippet,FktName);
m_snippet = sprintf('%s if isa(in1,''cuda'') \n',m_snippet);
m_snippet = sprintf('%s ref=cuda_cuda(''%s'',getReference(in1)); \n',m_snippet,FktName);
m_snippet = sprintf('%s else \n',m_snippet);
m_snippet = sprintf('%s error(''%s: Unknown datatype''); \n',m_snippet,FktName);
m_snippet = sprintf('%s end \n',m_snippet);
m_snippet = sprintf('%s out=cuda(ref,getFromDip(in1)); \n',m_snippet);
c_snippet = sprintf('else if (strcmp(command,"%s")==0) \n{ if (nrhs != 2) mexErrMsgTxt("cuda: %s needs two arguments"); \n', FktName,FktName);
c_snippet = sprintf('%s if (isComplexType(getCudaRefNum(prhs[1]))) \n', c_snippet);
c_snippet = sprintf('%s CUDA%s_carr(getCudaRef(prhs[1]),cudaAllocReal(prhs[1]),getTotalSizeFromRef(prhs[1])); \n',c_snippet,FktName);
c_snippet = sprintf('%s else \n',c_snippet);
c_snippet = sprintf('%s CUDA%s_arr(getCudaRef(prhs[1]),cudaAllocReal(prhs[1]),getTotalSizeFromRef(prhs[1])); \n',c_snippet,FktName);
c_snippet = sprintf('%s if (nlhs > 0) \n',c_snippet);
c_snippet = sprintf('%s plhs[0] = mxCreateDoubleScalar((double)free_array); \n',c_snippet);
c_snippet = sprintf('%s Dbg_printf("cuda: %s\");\n} \n',c_snippet,FktName);
cu_snippet = sprintf('%s(%s_arr,%s) \n%s(%s_carr,%s)\n',FktType,FktName,CoreCommandsReal,FktType,FktName,CoreCommandsCpxReal);
h_snippet = sprintf('externC const char * CUDA%s_arr(float * a, float * c, size_t N);\nexternC const char * CUDA%s_carr(float * a, float * c, size_t N);\n\n',FktName,FktName);
case 'CUDA_BinaryFkt' % Takes two arrays (a and b) as input and has one (c) as output
m_snippet = sprintf('%% out = %s(in1,in2) : User defined cuda function. %s\n',FktName,Comment);
m_snippet = sprintf('%s function out = %s(in1,in2)\n',m_snippet,FktName);
m_snippet = sprintf('%s if prod(size(in1)) > 1 && ~isa(in1,''cuda'') \n in1=cuda(in1); \nend\n',m_snippet);
m_snippet = sprintf('%s if prod(size(in2)) > 1 && ~isa(in2,''cuda'') \n in2=cuda(in2); \nend\n',m_snippet);
m_snippet = sprintf('%s if isa(in1,''cuda'') && isa(in2,''cuda'')\n',m_snippet);
m_snippet = sprintf('%s ref=cuda_cuda(''%s'',getReference(in1),getReference(in2)); \n',m_snippet,FktName);
m_snippet = sprintf('%s else \n',m_snippet);
m_snippet = sprintf('%s error(''%s: Needs two cuda arrays as input''); \n',m_snippet,FktName);
m_snippet = sprintf('%s end \n',m_snippet);
m_snippet = sprintf('%s out=cuda(ref,in1.fromDip || in2.fromDip); \n',m_snippet);
c_snippet = sprintf('else if (strcmp(command,"%s")==0) {\n CallCUDA_BinaryFkt(%s,cudaAllocSized)\n', FktName,FktName);
c_snippet = sprintf('%s if (nlhs > 0) \n',c_snippet);
c_snippet = sprintf('%s plhs[0] = mxCreateDoubleScalar((double)free_array); \n',c_snippet);
c_snippet = sprintf('%s Dbg_printf("cuda: %s\");\n} \n',c_snippet,FktName);
cu_snippet = sprintf('%s(arr_%s_arr,%s) \n%s(arr_%s_carr,%s)\n%s(carr_%s_arr,%s)\n%s(carr_%s_carr,%s)\n',FktType,FktName,CoreCommandsReal,FktType,FktName,CoreCommandsRealCpx,FktType,FktName,CoreCommandsCpxReal,FktType,FktName,CoreCommandsCpxCpx);
h_snippet = sprintf('externC const char * CUDAarr_%s_arr(float * a, float * b, float * c, size_t N);\nexternC const char * CUDAarr_%s_carr(float * a, float * b, float * c, size_t N);\nexternC const char * CUDAcarr_%s_arr(float * a, float * b, float * c, size_t N);\nexternC const char * CUDAcarr_%s_carr(float * a, float * b, float * c, size_t N);\n\n',FktName,FktName,FktName,FktName);
case 'CUDA_NArgsFkt' % Takes any number of arrays (f1 ... Fn) as input and has one (c) as output
AllArgs = sprintf('f%d,',[1:NArgs]); % f1,f2,f3, ...
AllArgs(end)=[];
m_snippet = sprintf('%% [fwd,grad] = %s(%s) : User defined cuda function. %s\n',FktName,AllArgs,Comment);
m_snippet = sprintf('%s function [fwd,grad] = %s(%s)\n',m_snippet,FktName,AllArgs);
m_snippet = sprintf('%s allcuda=1;anyDip=0;\n',m_snippet);
for n=1:NArgs
m_snippet = sprintf('%s if prod(size(f%d)) > 1 && ~isa(f%d,''cuda'') \n error(''%s: Automatic conversion to cuda not allowed due to in-place operations.'');f%d=cuda(f%d); \n end\n',m_snippet,n,n,FktName,n,n);
m_snippet = sprintf('%s allcuda= allcuda && isa(f%d,''cuda'');\n',m_snippet,n);
m_snippet = sprintf('%s anyDip= anyDip || f%d.fromDip;\n',m_snippet,n);
end
m_snippet = sprintf('%s if allcuda \n',m_snippet);
AllArgsRef = sprintf('getReference(f%d),',[1:NArgs]); % f1,f2,f3, ...
AllArgsRef(end)=[];
m_snippet = sprintf('%s ref=cuda_cuda(''%s'',%s); \n',m_snippet,FktName,AllArgsRef);
m_snippet = sprintf('%s else \n',m_snippet);
m_snippet = sprintf('%s error(''%s: Needs arrays to be cuda as input''); \n',m_snippet,FktName);
m_snippet = sprintf('%s end \n',m_snippet);
m_snippet = sprintf('%s grad=cuda(ref,anyDip); \n',m_snippet);
m_snippet = sprintf('%s fwd=f1; %% this was modified in the cuda code \n',m_snippet);
c_snippet = sprintf('else if (strcmp(command,"%s")==0) {\n CallCUDA_NArgsFkt(%s,cudaAllocSized,%d)\n', FktName,FktName,NArgs);
c_snippet = sprintf('%s if (nlhs > 0) \n',c_snippet);
c_snippet = sprintf('%s plhs[0] = mxCreateDoubleScalar((double)free_array); \n',c_snippet);
c_snippet = sprintf('%s Dbg_printf("cuda: %s\");\n} \n',c_snippet,FktName);
cu_snippet = sprintf('%s(arr_%s_NArgs,%s,%d) \n',FktType,FktName,CoreCommandsReal,NArgs); % the cuda-code has to be able to decide about complex or not inside each kernel!
h_snippet = sprintf('externC const char * CUDAarr_%s_NArgs(float * f[%d], float * c, size_t N, int numdims, SizeND sizesC, BoolND isSingleton[%d]);\n',FktName,NArgs,NArgs);
otherwise
error('cuda_define: Unknown Function Type!');
end
global cuda_to_compile;
if exist('cuda_to_compile') && ~isempty(cuda_to_compile)
myindex=find(strcmp(cuda_to_compile.name,FktName));
if isempty(myindex) % This function has not already been defined
myindex=length(cuda_to_compile.name)+1; % append a new element
cuda_to_compile.needsRecompile=1;
else
fprintf('WARNING: Redefining cuda function %s, which was already defined\n',FktName);
end
else
myindex = 1;
end
cuda_to_compile.name{myindex} = FktName;
cuda_to_compile.m_snippet{myindex} = m_snippet;
cuda_to_compile.c_snippet{myindex} = c_snippet;
cuda_to_compile.cu_snippet{myindex} = cu_snippet;
cuda_to_compile.h_snippet{myindex} = h_snippet;
if ~isfield(cuda_to_compile,'needsRecompile')
cuda_to_compile.needsRecompile=0;
end
if cuda_to_compile.needsRecompile || ~isfield(cuda_to_compile,'coreCommandsReal') || ~strcmp(cuda_to_compile.coreCommandsReal{myindex},CoreCommandsReal)
cuda_to_compile.coreCommandsReal{myindex} = CoreCommandsReal;
cuda_to_compile.needsRecompile = 1;
end
if cuda_to_compile.needsRecompile || ~isfield(cuda_to_compile,'coreCommandsCpxReal') || ~strcmp(cuda_to_compile.coreCommandsCpxReal{myindex},CoreCommandsCpxReal)
cuda_to_compile.coreCommandsCpxReal{myindex} = CoreCommandsCpxReal;
cuda_to_compile.needsRecompile = 1;
end
if cuda_to_compile.needsRecompile || ~isfield(cuda_to_compile,'coreCommandsRealCpx') || ~strcmp(cuda_to_compile.coreCommandsRealCpx{myindex},CoreCommandsRealCpx)
cuda_to_compile.coreCommandsRealCpx{myindex} = CoreCommandsRealCpx;
cuda_to_compile.needsRecompile = 1;
end
if cuda_to_compile.needsRecompile || ~isfield(cuda_to_compile,'coreCommandsCpxCpx') || ~strcmp(cuda_to_compile.coreCommandsCpxCpx{myindex},CoreCommandsCpxCpx)
cuda_to_compile.coreCommandsCpxCpx{myindex} = CoreCommandsCpxCpx;
cuda_to_compile.needsRecompile = 1;
end