1
+ #include < stdio.h>
2
+ #include < opencv2/opencv.hpp>
3
+
4
+ using namespace std ;
5
+ using namespace cv ;
6
+
7
+ inline unsigned char IM_ClampToByte (int Value)
8
+ {
9
+ if (Value < 0 )
10
+ return 0 ;
11
+ else if (Value > 255 )
12
+ return 255 ;
13
+ else
14
+ return (unsigned char )Value;
15
+ // return ((Value | ((signed int)(255 - Value) >> 31)) & ~((signed int)Value >> 31));
16
+ }
17
+
18
+ void Sobel_FLOAT (unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) {
19
+ int Channel = Stride / Width;
20
+ unsigned char *RowCopy = (unsigned char *)malloc ((Width + 2 ) * 3 * Channel);
21
+ unsigned char *First = RowCopy;
22
+ unsigned char *Second = RowCopy + (Width + 2 ) * Channel;
23
+ unsigned char *Third = RowCopy + (Width + 2 ) * 2 * Channel;
24
+ // 拷贝第二行数据,边界值填充
25
+ memcpy (Second, Src, Channel);
26
+ memcpy (Second + Channel, Src, Width*Channel);
27
+ memcpy (Second + (Width + 1 )*Channel, Src + (Width - 1 )*Channel, Channel);
28
+ // 第一行和第二行一样
29
+ memcpy (First, Second, (Width + 2 ) * Channel);
30
+ // 拷贝第三行数据,边界值填充
31
+ memcpy (Third, Src + Stride, Channel);
32
+ memcpy (Third + Channel, Src + Stride, Width * Channel);
33
+ memcpy (Third + (Width + 1 ) * Channel, Src + Stride + (Width - 1 ) * Channel, Channel);
34
+
35
+ for (int Y = 0 ; Y < Height; Y++) {
36
+ unsigned char *LinePS = Src + Y * Stride;
37
+ unsigned char *LinePD = Dest + Y * Stride;
38
+ if (Y != 0 ) {
39
+ unsigned char *Temp = First;
40
+ First = Second;
41
+ Second = Third;
42
+ Third = Temp;
43
+ }
44
+ if (Y == Height - 1 ) {
45
+ memcpy (Third, Second, (Width + 2 ) * Channel);
46
+ }
47
+ else {
48
+ memcpy (Third, Src + (Y + 1 ) * Stride, Channel);
49
+ memcpy (Third + Channel, Src + (Y + 1 ) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的
50
+ memcpy (Third + (Width + 1 ) * Channel, Src + (Y + 1 ) * Stride + (Width - 1 ) * Channel, Channel);
51
+ }
52
+ if (Channel == 1 ) {
53
+ for (int X = 0 ; X < Width; X++)
54
+ {
55
+ int GX = First[X] - First[X + 2 ] + (Second[X] - Second[X + 2 ]) * 2 + Third[X] - Third[X + 2 ];
56
+ int GY = First[X] + First[X + 2 ] + (First[X + 1 ] - Third[X + 1 ]) * 2 - Third[X] - Third[X + 2 ];
57
+ LinePD[X] = IM_ClampToByte (sqrtf (GX * GX + GY * GY + 0 .0F ));
58
+ }
59
+ }
60
+ else
61
+ {
62
+ for (int X = 0 ; X < Width * 3 ; X++)
63
+ {
64
+ int GX = First[X] - First[X + 6 ] + (Second[X] - Second[X + 6 ]) * 2 + Third[X] - Third[X + 6 ];
65
+ int GY = First[X] + First[X + 6 ] + (First[X + 3 ] - Third[X + 3 ]) * 2 - Third[X] - Third[X + 6 ];
66
+ LinePD[X] = IM_ClampToByte (sqrtf (GX * GX + GY * GY + 0 .0F ));
67
+ }
68
+ }
69
+ }
70
+ free (RowCopy);
71
+ }
72
+
73
+ void Sobel_INT (unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) {
74
+ int Channel = Stride / Width;
75
+ unsigned char *RowCopy = (unsigned char *)malloc ((Width + 2 ) * 3 * Channel);
76
+ unsigned char *First = RowCopy;
77
+ unsigned char *Second = RowCopy + (Width + 2 ) * Channel;
78
+ unsigned char *Third = RowCopy + (Width + 2 ) * 2 * Channel;
79
+ // 拷贝第二行数据,边界值填充
80
+ memcpy (Second, Src, Channel);
81
+ memcpy (Second + Channel, Src, Width*Channel);
82
+ memcpy (Second + (Width + 1 )*Channel, Src + (Width - 1 )*Channel, Channel);
83
+ // 第一行和第二行一样
84
+ memcpy (First, Second, (Width + 2 ) * Channel);
85
+ // 拷贝第三行数据,边界值填充
86
+ memcpy (Third, Src + Stride, Channel);
87
+ memcpy (Third + Channel, Src + Stride, Width * Channel);
88
+ memcpy (Third + (Width + 1 ) * Channel, Src + Stride + (Width - 1 ) * Channel, Channel);
89
+
90
+ unsigned char Table[65026 ];
91
+ for (int Y = 0 ; Y < 65026 ; Y++) Table[Y] = (sqrtf (Y + 0 .0f ) + 0 .5f );
92
+ for (int Y = 0 ; Y < Height; Y++) {
93
+ unsigned char *LinePS = Src + Y * Stride;
94
+ unsigned char *LinePD = Dest + Y * Stride;
95
+ if (Y != 0 ) {
96
+ unsigned char *Temp = First;
97
+ First = Second;
98
+ Second = Third;
99
+ Third = Temp;
100
+ }
101
+ if (Y == Height - 1 ) {
102
+ memcpy (Third, Second, (Width + 2 ) * Channel);
103
+ }
104
+ else {
105
+ memcpy (Third, Src + (Y + 1 ) * Stride, Channel);
106
+ memcpy (Third + Channel, Src + (Y + 1 ) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的
107
+ memcpy (Third + (Width + 1 ) * Channel, Src + (Y + 1 ) * Stride + (Width - 1 ) * Channel, Channel);
108
+ }
109
+ if (Channel == 1 ) {
110
+ for (int X = 0 ; X < Width; X++)
111
+ {
112
+ int GX = First[X] - First[X + 2 ] + (Second[X] - Second[X + 2 ]) * 2 + Third[X] - Third[X + 2 ];
113
+ int GY = First[X] + First[X + 2 ] + (First[X + 1 ] - Third[X + 1 ]) * 2 - Third[X] - Third[X + 2 ];
114
+ LinePD[X] = Table[min (GX * GX + GY * GY, 65025 )];
115
+ }
116
+ }
117
+ else
118
+ {
119
+ for (int X = 0 ; X < Width * 3 ; X++)
120
+ {
121
+ int GX = First[X] - First[X + 6 ] + (Second[X] - Second[X + 6 ]) * 2 + Third[X] - Third[X + 6 ];
122
+ int GY = First[X] + First[X + 6 ] + (First[X + 3 ] - Third[X + 3 ]) * 2 - Third[X] - Third[X + 6 ];
123
+ LinePD[X] = Table[min (GX * GX + GY * GY, 65025 )];
124
+ }
125
+ }
126
+ }
127
+ free (RowCopy);
128
+ }
129
+
130
+ void Sobel_SSE (unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) {
131
+ int Channel = Stride / Width;
132
+ unsigned char *RowCopy = (unsigned char *)malloc ((Width + 2 ) * 3 * Channel);
133
+ unsigned char *First = RowCopy;
134
+ unsigned char *Second = RowCopy + (Width + 2 ) * Channel;
135
+ unsigned char *Third = RowCopy + (Width + 2 ) * 2 * Channel;
136
+ // 拷贝第二行数据,边界值填充
137
+ memcpy (Second, Src, Channel);
138
+ memcpy (Second + Channel, Src, Width*Channel);
139
+ memcpy (Second + (Width + 1 )*Channel, Src + (Width - 1 )*Channel, Channel);
140
+ // 第一行和第二行一样
141
+ memcpy (First, Second, (Width + 2 ) * Channel);
142
+ // 拷贝第三行数据,边界值填充
143
+ memcpy (Third, Src + Stride, Channel);
144
+ memcpy (Third + Channel, Src + Stride, Width * Channel);
145
+ memcpy (Third + (Width + 1 ) * Channel, Src + Stride + (Width - 1 ) * Channel, Channel);
146
+
147
+ int BlockSize = 8 , Block = (Width * Channel) / BlockSize;
148
+
149
+ unsigned char Table[65026 ];
150
+ for (int Y = 0 ; Y < 65026 ; Y++) Table[Y] = (sqrtf (Y + 0 .0f ) + 0 .5f );
151
+ for (int Y = 0 ; Y < Height; Y++) {
152
+ unsigned char *LinePS = Src + Y * Stride;
153
+ unsigned char *LinePD = Dest + Y * Stride;
154
+ if (Y != 0 ) {
155
+ unsigned char *Temp = First;
156
+ First = Second;
157
+ Second = Third;
158
+ Third = Temp;
159
+ }
160
+ if (Y == Height - 1 ) {
161
+ memcpy (Third, Second, (Width + 2 ) * Channel);
162
+ }
163
+ else {
164
+ memcpy (Third, Src + (Y + 1 ) * Stride, Channel);
165
+ memcpy (Third + Channel, Src + (Y + 1 ) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的
166
+ memcpy (Third + (Width + 1 ) * Channel, Src + (Y + 1 ) * Stride + (Width - 1 ) * Channel, Channel);
167
+ }
168
+ if (Channel == 1 ) {
169
+ for (int X = 0 ; X < Width; X++)
170
+ {
171
+ int GX = First[X] - First[X + 2 ] + (Second[X] - Second[X + 2 ]) * 2 + Third[X] - Third[X + 2 ];
172
+ int GY = First[X] + First[X + 2 ] + (First[X + 1 ] - Third[X + 1 ]) * 2 - Third[X] - Third[X + 2 ];
173
+ // LinePD[X] = Table[min(GX * GX + GY * GY, 65025)];
174
+ }
175
+ }
176
+ else
177
+ {
178
+ __m128i Zero = _mm_setzero_si128 ();
179
+ for (int X = 0 ; X < Block * BlockSize; X += BlockSize)
180
+ {
181
+ __m128i FirstP0 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(First + X)), Zero);
182
+ __m128i FirstP1 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(First + X + 3 )), Zero);
183
+ __m128i FirstP2 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(First + X + 6 )), Zero);
184
+
185
+ __m128i SecondP0 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(Second + X)), Zero);
186
+ __m128i SecondP2 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(Second + X + 6 )), Zero);
187
+
188
+ __m128i ThirdP0 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(Third + X)), Zero);
189
+ __m128i ThirdP1 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(Third + X + 3 )), Zero);
190
+ __m128i ThirdP2 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(Third + X + 6 )), Zero);
191
+
192
+ __m128i GX16 = _mm_abs_epi16 (_mm_add_epi16 (_mm_add_epi16 (_mm_sub_epi16 (FirstP0, FirstP2), _mm_slli_epi16 (_mm_sub_epi16 (SecondP0, SecondP2), 1 )), _mm_sub_epi16 (ThirdP0, ThirdP2)));
193
+ __m128i GY16 = _mm_abs_epi16 (_mm_sub_epi16 (_mm_add_epi16 (_mm_add_epi16 (FirstP0, FirstP2), _mm_slli_epi16 (_mm_sub_epi16 (FirstP1, ThirdP1), 1 )), _mm_add_epi16 (ThirdP0, ThirdP2)));
194
+
195
+ __m128i GXYL = _mm_unpacklo_epi16 (GX16, GY16);
196
+ __m128i GXYH = _mm_unpackhi_epi16 (GX16, GY16);
197
+
198
+ __m128i ResultL = _mm_cvtps_epi32 (_mm_sqrt_ps (_mm_cvtepi32_ps (_mm_madd_epi16 (GXYL, GXYL))));
199
+ __m128i ResultH = _mm_cvtps_epi32 (_mm_sqrt_ps (_mm_cvtepi32_ps (_mm_madd_epi16 (GXYH, GXYH))));
200
+
201
+ /* __m128i GX32L = _mm_unpacklo_epi16(GX16, Zero);
202
+ __m128i GX32H = _mm_unpackhi_epi16(GX16, Zero);
203
+ __m128i GY32L = _mm_unpacklo_epi16(GY16, Zero);
204
+ __m128i GY32H = _mm_unpackhi_epi16(GY16, Zero);
205
+
206
+ __m128i ResultL = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_add_epi32(_mm_mullo_epi32(GX32L, GX32L), _mm_mullo_epi32(GY32L, GY32L)))));
207
+ __m128i ResultH = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_add_epi32(_mm_mullo_epi32(GX32H, GX32H), _mm_mullo_epi32(GY32H, GY32H)))));
208
+
209
+ _mm_storel_epi64((__m128i *)(LinePD + X), _mm_packus_epi16(_mm_packus_epi32(ResultL, ResultH), Zero));*/
210
+ _mm_storel_epi64 ((__m128i *)(LinePD + X), _mm_packus_epi16 (_mm_packus_epi32 (ResultL, ResultH), Zero));
211
+ }
212
+
213
+ for (int X = Block * BlockSize; X < Width * 3 ; X++)
214
+ {
215
+ int GX = First[X] - First[X + 6 ] + (Second[X] - Second[X + 6 ]) * 2 + Third[X] - Third[X + 6 ];
216
+ int GY = First[X] + First[X + 6 ] + (First[X + 3 ] - Third[X + 3 ]) * 2 - Third[X] - Third[X + 6 ];
217
+ LinePD[X] = IM_ClampToByte (sqrtf (GX * GX + GY * GY + 0 .0F ));
218
+ }
219
+ }
220
+ }
221
+ free (RowCopy);
222
+ }
223
+
224
+ int main () {
225
+ Mat src = imread (" F:\\ car.jpg" );
226
+ int Height = src.rows ;
227
+ int Width = src.cols ;
228
+ unsigned char *Src = src.data ;
229
+ unsigned char *Dest = new unsigned char [Height * Width * 3 ];
230
+ int Stride = Width * 3 ;
231
+ int Radius = 11 ;
232
+ int Adjustment = 50 ;
233
+ int64 st = cvGetTickCount ();
234
+ for (int i = 0 ; i <50 ; i++) {
235
+ Sobel_SSE (Src, Dest, Width, Height, Stride);
236
+ }
237
+ double duration = (cv::getTickCount () - st) / cv::getTickFrequency () * 20 ;
238
+ printf (" %.5f\n " , duration);
239
+ Sobel_SSE (Src, Dest, Width, Height, Stride);
240
+ Mat dst (Height, Width, CV_8UC3, Dest);
241
+ imshow (" origin" , src);
242
+ imshow (" result" , dst);
243
+ imwrite (" F:\\ res.jpg" , dst);
244
+ waitKey (0 );
245
+ waitKey (0 );
246
+ }
0 commit comments