Leptonica 1.68
C Image Processing Library
|
00001 /*====================================================================* 00002 - Copyright (C) 2001 Leptonica. All rights reserved. 00003 - This software is distributed in the hope that it will be 00004 - useful, but with NO WARRANTY OF ANY KIND. 00005 - No author or distributor accepts responsibility to anyone for the 00006 - consequences of using this software, or for whether it serves any 00007 - particular purpose or works at all, unless he or she says so in 00008 - writing. Everyone is granted permission to copy, modify and 00009 - redistribute this source code, for commercial or non-commercial 00010 - purposes, with the following restrictions: (1) the origin of this 00011 - source code must not be misrepresented; (2) modified versions must 00012 - be plainly marked as such; and (3) this notice may not be removed 00013 - or altered from any source or modified source distribution. 00014 *====================================================================*/ 00015 00016 /* 00017 * numafunc2.c 00018 * 00019 * Morphological (min/max) operations 00020 * NUMA *numaErode() 00021 * NUMA *numaDilate() 00022 * NUMA *numaOpen() 00023 * NUMA *numaClose() 00024 * 00025 * Other transforms 00026 * NUMA *numaTransform() 00027 * l_int32 numaWindowedStats() 00028 * NUMA *numaWindowedMean() 00029 * NUMA *numaWindowedMeanSquare() 00030 * l_int32 numaWindowedVariance() 00031 * NUMA *numaConvertToInt() 00032 * 00033 * Histogram generation and statistics 00034 * NUMA *numaMakeHistogram() 00035 * NUMA *numaMakeHistogramAuto() 00036 * NUMA *numaMakeHistogramClipped() 00037 * NUMA *numaRebinHistogram() 00038 * NUMA *numaNormalizeHistogram() 00039 * l_int32 numaGetStatsUsingHistogram() 00040 * l_int32 numaGetHistogramStats() 00041 * l_int32 numaGetHistogramStatsOnInterval() 00042 * l_int32 numaMakeRankFromHistogram() 00043 * l_int32 numaHistogramGetRankFromVal() 00044 * l_int32 numaHistogramGetValFromRank() 00045 * l_int32 numaDiscretizeRankAndIntensity() 00046 * l_int32 numaGetRankBinValues() 00047 * 00048 * Splitting a distribution 00049 * l_int32 numaSplitDistribution() 00050 * 00051 * Extrema finding 00052 * NUMA *numaFindPeaks() 00053 * NUMA *numaFindExtrema() 00054 * l_int32 *numaCountReversals() 00055 * 00056 * Threshold crossings and frequency analysis 00057 * l_int32 numaSelectCrossingThreshold() 00058 * NUMA *numaCrossingsByThreshold() 00059 * NUMA *numaCrossingsByPeaks() 00060 * NUMA *numaEvalBestHaarParameters() 00061 * l_int32 numaEvalHaarSum() 00062 * 00063 * Things to remember when using the Numa: 00064 * 00065 * (1) The numa is a struct, not an array. Always use accessors 00066 * (see numabasic.c), never the fields directly. 00067 * 00068 * (2) The number array holds l_float32 values. It can also 00069 * be used to store l_int32 values. See numabasic.c for 00070 * details on using the accessors. 00071 * 00072 * (3) Occasionally, in the comments we denote the i-th element of a 00073 * numa by na[i]. This is conceptual only -- the numa is not an array! 00074 * 00075 * Some general comments on histograms: 00076 * 00077 * (1) Histograms are the generic statistical representation of 00078 * the data about some attribute. Typically they're not 00079 * normalized -- they simply give the number of occurrences 00080 * within each range of values of the attribute. This range 00081 * of values is referred to as a 'bucket'. For example, 00082 * the histogram could specify how many connected components 00083 * are found for each value of their width; in that case, 00084 * the bucket size is 1. 00085 * 00086 * (2) In leptonica, all buckets have the same size. Histograms 00087 * are therefore specified by a numa of occurrences, along 00088 * with two other numbers: the 'value' associated with the 00089 * occupants of the first bucket and the size (i.e., 'width') 00090 * of each bucket. These two numbers then allow us to calculate 00091 * the value associated with the occupants of each bucket. 00092 * These numbers are fields in the numa, initialized to 00093 * a startx value of 0.0 and a binsize of 1.0. Accessors for 00094 * these fields are functions numa*XParameters(). All histograms 00095 * must have these two numbers properly set. 00096 */ 00097 00098 #include <math.h> 00099 #include "allheaders.h" 00100 00101 /* bin sizes in numaMakeHistogram() */ 00102 static const l_int32 BinSizeArray[] = {2, 5, 10, 20, 50, 100, 200, 500, 1000,\ 00103 2000, 5000, 10000, 20000, 50000, 100000, 200000,\ 00104 500000, 1000000, 2000000, 5000000, 10000000,\ 00105 200000000, 50000000, 100000000}; 00106 static const l_int32 NBinSizes = 24; 00107 00108 00109 #ifndef NO_CONSOLE_IO 00110 #define DEBUG_HISTO 0 00111 #define DEBUG_CROSSINGS 0 00112 #define DEBUG_FREQUENCY 0 00113 #endif /* ~NO_CONSOLE_IO */ 00114 00115 00116 /*----------------------------------------------------------------------* 00117 * Morphological operations * 00118 *----------------------------------------------------------------------*/ 00119 /*! 00120 * numaErode() 00121 * 00122 * Input: nas 00123 * size (of sel; greater than 0, odd; origin implicitly in center) 00124 * Return: nad (eroded), or null on error 00125 * 00126 * Notes: 00127 * (1) The structuring element (sel) is linear, all "hits" 00128 * (2) If size == 1, this returns a copy 00129 * (3) General comment. The morphological operations are equivalent 00130 * to those that would be performed on a 1-dimensional fpix. 00131 * However, because we have not implemented morphological 00132 * operations on fpix, we do this here. Because it is only 00133 * 1 dimensional, there is no reason to use the more 00134 * complicated van Herk/Gil-Werman algorithm, and we do it 00135 * by brute force. 00136 */ 00137 NUMA * 00138 numaErode(NUMA *nas, 00139 l_int32 size) 00140 { 00141 l_int32 i, j, n, hsize, len; 00142 l_float32 minval; 00143 l_float32 *fa, *fas, *fad; 00144 NUMA *nad; 00145 00146 PROCNAME("numaErode"); 00147 00148 if (!nas) 00149 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00150 if (size <= 0) 00151 return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL); 00152 if ((size & 1) == 0 ) { 00153 L_WARNING("sel size must be odd; increasing by 1", procName); 00154 size++; 00155 } 00156 00157 if (size == 1) 00158 return numaCopy(nas); 00159 00160 /* Make a source fa (fas) that has an added (size / 2) boundary 00161 * on left and right, contains a copy of nas in the interior region 00162 * (between 'size' and 'size + n', and has large values 00163 * inserted in the boundary (because it is an erosion). */ 00164 n = numaGetCount(nas); 00165 hsize = size / 2; 00166 len = n + 2 * hsize; 00167 if ((fas = (l_float32 *)CALLOC(len, sizeof(l_float32))) == NULL) 00168 return (NUMA *)ERROR_PTR("fas not made", procName, NULL); 00169 for (i = 0; i < hsize; i++) 00170 fas[i] = 1.0e37; 00171 for (i = hsize + n; i < len; i++) 00172 fas[i] = 1.0e37; 00173 fa = numaGetFArray(nas, L_NOCOPY); 00174 for (i = 0; i < n; i++) 00175 fas[hsize + i] = fa[i]; 00176 00177 nad = numaMakeConstant(0, n); 00178 numaCopyXParameters(nad, nas); 00179 fad = numaGetFArray(nad, L_NOCOPY); 00180 for (i = 0; i < n; i++) { 00181 minval = 1.0e37; /* start big */ 00182 for (j = 0; j < size; j++) 00183 minval = L_MIN(minval, fas[i + j]); 00184 fad[i] = minval; 00185 } 00186 00187 FREE(fas); 00188 return nad; 00189 } 00190 00191 00192 /*! 00193 * numaDilate() 00194 * 00195 * Input: nas 00196 * size (of sel; greater than 0, odd; origin implicitly in center) 00197 * Return: nad (dilated), or null on error 00198 * 00199 * Notes: 00200 * (1) The structuring element (sel) is linear, all "hits" 00201 * (2) If size == 1, this returns a copy 00202 */ 00203 NUMA * 00204 numaDilate(NUMA *nas, 00205 l_int32 size) 00206 { 00207 l_int32 i, j, n, hsize, len; 00208 l_float32 maxval; 00209 l_float32 *fa, *fas, *fad; 00210 NUMA *nad; 00211 00212 PROCNAME("numaDilate"); 00213 00214 if (!nas) 00215 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00216 if (size <= 0) 00217 return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL); 00218 if ((size & 1) == 0 ) { 00219 L_WARNING("sel size must be odd; increasing by 1", procName); 00220 size++; 00221 } 00222 00223 if (size == 1) 00224 return numaCopy(nas); 00225 00226 /* Make a source fa (fas) that has an added (size / 2) boundary 00227 * on left and right, contains a copy of nas in the interior region 00228 * (between 'size' and 'size + n', and has small values 00229 * inserted in the boundary (because it is a dilation). */ 00230 n = numaGetCount(nas); 00231 hsize = size / 2; 00232 len = n + 2 * hsize; 00233 if ((fas = (l_float32 *)CALLOC(len, sizeof(l_float32))) == NULL) 00234 return (NUMA *)ERROR_PTR("fas not made", procName, NULL); 00235 for (i = 0; i < hsize; i++) 00236 fas[i] = -1.0e37; 00237 for (i = hsize + n; i < len; i++) 00238 fas[i] = -1.0e37; 00239 fa = numaGetFArray(nas, L_NOCOPY); 00240 for (i = 0; i < n; i++) 00241 fas[hsize + i] = fa[i]; 00242 00243 nad = numaMakeConstant(0, n); 00244 numaCopyXParameters(nad, nas); 00245 fad = numaGetFArray(nad, L_NOCOPY); 00246 for (i = 0; i < n; i++) { 00247 maxval = -1.0e37; /* start small */ 00248 for (j = 0; j < size; j++) 00249 maxval = L_MAX(maxval, fas[i + j]); 00250 fad[i] = maxval; 00251 } 00252 00253 FREE(fas); 00254 return nad; 00255 } 00256 00257 00258 /*! 00259 * numaOpen() 00260 * 00261 * Input: nas 00262 * size (of sel; greater than 0, odd; origin implicitly in center) 00263 * Return: nad (opened), or null on error 00264 * 00265 * Notes: 00266 * (1) The structuring element (sel) is linear, all "hits" 00267 * (2) If size == 1, this returns a copy 00268 */ 00269 NUMA * 00270 numaOpen(NUMA *nas, 00271 l_int32 size) 00272 { 00273 NUMA *nat, *nad; 00274 00275 PROCNAME("numaOpen"); 00276 00277 if (!nas) 00278 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00279 if (size <= 0) 00280 return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL); 00281 if ((size & 1) == 0 ) { 00282 L_WARNING("sel size must be odd; increasing by 1", procName); 00283 size++; 00284 } 00285 00286 if (size == 1) 00287 return numaCopy(nas); 00288 00289 nat = numaErode(nas, size); 00290 nad = numaDilate(nat, size); 00291 numaDestroy(&nat); 00292 return nad; 00293 } 00294 00295 00296 /*! 00297 * numaClose() 00298 * 00299 * Input: nas 00300 * size (of sel; greater than 0, odd; origin implicitly in center) 00301 * Return: nad (opened), or null on error 00302 * 00303 * Notes: 00304 * (1) The structuring element (sel) is linear, all "hits" 00305 * (2) If size == 1, this returns a copy 00306 * (3) We add a border before doing this operation, for the same 00307 * reason that we add a border to a pix before doing a safe closing. 00308 * Without the border, a small component near the border gets 00309 * clipped at the border on dilation, and can be entirely removed 00310 * by the following erosion, violating the basic extensivity 00311 * property of closing. 00312 */ 00313 NUMA * 00314 numaClose(NUMA *nas, 00315 l_int32 size) 00316 { 00317 NUMA *nab, *nat1, *nat2, *nad; 00318 00319 PROCNAME("numaClose"); 00320 00321 if (!nas) 00322 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00323 if (size <= 0) 00324 return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL); 00325 if ((size & 1) == 0 ) { 00326 L_WARNING("sel size must be odd; increasing by 1", procName); 00327 size++; 00328 } 00329 00330 if (size == 1) 00331 return numaCopy(nas); 00332 00333 nab = numaAddBorder(nas, size, size, 0); /* to preserve extensivity */ 00334 nat1 = numaDilate(nab, size); 00335 nat2 = numaErode(nat1, size); 00336 nad = numaRemoveBorder(nat2, size, size); 00337 numaDestroy(&nab); 00338 numaDestroy(&nat1); 00339 numaDestroy(&nat2); 00340 return nad; 00341 } 00342 00343 00344 /*----------------------------------------------------------------------* 00345 * Other transforms * 00346 *----------------------------------------------------------------------*/ 00347 /*! 00348 * numaTransform() 00349 * 00350 * Input: nas 00351 * shift (add this to each number) 00352 * scale (multiply each number by this) 00353 * Return: nad (with all values shifted and scaled, or null on error) 00354 * 00355 * Notes: 00356 * (1) Each number is shifted before scaling. 00357 * (2) The operation sequence is opposite to that for Box and Pta: 00358 * scale first, then shift. 00359 */ 00360 NUMA * 00361 numaTransform(NUMA *nas, 00362 l_float32 shift, 00363 l_float32 scale) 00364 { 00365 l_int32 i, n; 00366 l_float32 val; 00367 NUMA *nad; 00368 00369 PROCNAME("numaTransform"); 00370 00371 if (!nas) 00372 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00373 n = numaGetCount(nas); 00374 if ((nad = numaCreate(n)) == NULL) 00375 return (NUMA *)ERROR_PTR("nad not made", procName, NULL); 00376 for (i = 0; i < n; i++) { 00377 numaGetFValue(nas, i, &val); 00378 val = scale * val + shift; 00379 numaAddNumber(nad, val); 00380 } 00381 return nad; 00382 } 00383 00384 00385 /*! 00386 * numaWindowedStats() 00387 * 00388 * Input: nas (input numa) 00389 * wc (half width of the window) 00390 * &nam (<optional return> mean value in window) 00391 * &nams (<optional return> mean square value in window) 00392 * &pnav (<optional return> variance in window) 00393 * &pnarv (<optional return> rms deviation from the mean) 00394 * Return: 0 if OK, 1 on error 00395 * 00396 * Notes: 00397 * (1) This is a high-level convenience function for calculating 00398 * any or all of these derived arrays. 00399 * (2) These statistical measures over the values in the 00400 * rectangular window are: 00401 * - average value: <x> (nam) 00402 * - average squared value: <x*x> (nams) 00403 * - variance: <(x - <x>)*(x - <x>)> = <x*x> - <x>*<x> (nav) 00404 * - square-root of variance: (narv) 00405 * where the brackets < .. > indicate that the average value is 00406 * to be taken over the window. 00407 * (3) Note that the variance is just the mean square difference from 00408 * the mean value; and the square root of the variance is the 00409 * root mean square difference from the mean, sometimes also 00410 * called the 'standard deviation'. 00411 * (4) Internally, use mirrored borders to handle values near the 00412 * end of each array. 00413 */ 00414 l_int32 00415 numaWindowedStats(NUMA *nas, 00416 l_int32 wc, 00417 NUMA **pnam, 00418 NUMA **pnams, 00419 NUMA **pnav, 00420 NUMA **pnarv) 00421 { 00422 NUMA *nam, *nams; 00423 00424 PROCNAME("numaWindowedStats"); 00425 00426 if (!nas) 00427 return ERROR_INT("nas not defined", procName, 1); 00428 if (2 * wc + 1 > numaGetCount(nas)) 00429 L_WARNING("filter wider than input array!", procName); 00430 00431 if (!pnav && !pnarv) { 00432 if (pnam) *pnam = numaWindowedMean(nas, wc); 00433 if (pnams) *pnams = numaWindowedMeanSquare(nas, wc); 00434 return 0; 00435 } 00436 00437 nam = numaWindowedMean(nas, wc); 00438 nams = numaWindowedMeanSquare(nas, wc); 00439 numaWindowedVariance(nam, nams, pnav, pnarv); 00440 if (pnam) 00441 *pnam = nam; 00442 else 00443 numaDestroy(&nam); 00444 if (pnams) 00445 *pnams = nams; 00446 else 00447 numaDestroy(&nams); 00448 return 0; 00449 } 00450 00451 00452 /*! 00453 * numaWindowedMean() 00454 * 00455 * Input: nas 00456 * wc (half width of the convolution window) 00457 * Return: nad (after low-pass filtering), or null on error 00458 * 00459 * Notes: 00460 * (1) This is a convolution. The window has width = 2 * @wc + 1. 00461 * (2) We add a mirrored border of size @wc to each end of the array. 00462 */ 00463 NUMA * 00464 numaWindowedMean(NUMA *nas, 00465 l_int32 wc) 00466 { 00467 l_int32 i, n, n1, width; 00468 l_float32 sum, norm; 00469 l_float32 *fa1, *fad, *suma; 00470 NUMA *na1, *nad; 00471 00472 PROCNAME("numaWindowedMean"); 00473 00474 if (!nas) 00475 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00476 n = numaGetCount(nas); 00477 width = 2 * wc + 1; /* filter width */ 00478 if (width > n) 00479 L_WARNING("filter wider than input array!", procName); 00480 00481 na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER); 00482 n1 = n + 2 * wc; 00483 fa1 = numaGetFArray(na1, L_NOCOPY); 00484 nad = numaMakeConstant(0, n); 00485 fad = numaGetFArray(nad, L_NOCOPY); 00486 00487 /* Make sum array; note the indexing */ 00488 if ((suma = (l_float32 *)CALLOC(n1 + 1, sizeof(l_float32))) == NULL) 00489 return (NUMA *)ERROR_PTR("suma not made", procName, NULL); 00490 sum = 0.0; 00491 suma[0] = 0.0; 00492 for (i = 0; i < n1; i++) { 00493 sum += fa1[i]; 00494 suma[i + 1] = sum; 00495 } 00496 00497 norm = 1. / (2 * wc + 1); 00498 for (i = 0; i < n; i++) 00499 fad[i] = norm * (suma[width + i] - suma[i]); 00500 00501 FREE(suma); 00502 numaDestroy(&na1); 00503 return nad; 00504 } 00505 00506 00507 /*! 00508 * numaWindowedMeanSquare() 00509 * 00510 * Input: nas 00511 * wc (half width of the window) 00512 * Return: nad (containing windowed mean square values), or null on error 00513 * 00514 * Notes: 00515 * (1) The window has width = 2 * @wc + 1. 00516 * (2) We add a mirrored border of size @wc to each end of the array. 00517 */ 00518 NUMA * 00519 numaWindowedMeanSquare(NUMA *nas, 00520 l_int32 wc) 00521 { 00522 l_int32 i, n, n1, width; 00523 l_float32 sum, norm; 00524 l_float32 *fa1, *fad, *suma; 00525 NUMA *na1, *nad; 00526 00527 PROCNAME("numaWindowedMeanSquare"); 00528 00529 if (!nas) 00530 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00531 n = numaGetCount(nas); 00532 width = 2 * wc + 1; /* filter width */ 00533 if (width > n) 00534 L_WARNING("filter wider than input array!", procName); 00535 00536 na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER); 00537 n1 = n + 2 * wc; 00538 fa1 = numaGetFArray(na1, L_NOCOPY); 00539 nad = numaMakeConstant(0, n); 00540 fad = numaGetFArray(nad, L_NOCOPY); 00541 00542 /* Make sum array; note the indexing */ 00543 if ((suma = (l_float32 *)CALLOC(n1 + 1, sizeof(l_float32))) == NULL) 00544 return (NUMA *)ERROR_PTR("suma not made", procName, NULL); 00545 sum = 0.0; 00546 suma[0] = 0.0; 00547 for (i = 0; i < n1; i++) { 00548 sum += fa1[i] * fa1[i]; 00549 suma[i + 1] = sum; 00550 } 00551 00552 norm = 1. / (2 * wc + 1); 00553 for (i = 0; i < n; i++) 00554 fad[i] = norm * (suma[width + i] - suma[i]); 00555 00556 FREE(suma); 00557 numaDestroy(&na1); 00558 return nad; 00559 } 00560 00561 00562 /*! 00563 * numaWindowedVariance() 00564 * 00565 * Input: nam (windowed mean values) 00566 * nams (windowed mean square values) 00567 * &pnav (<optional return> numa of variance -- the ms deviation 00568 * from the mean) 00569 * &pnarv (<optional return> numa of rms deviation from the mean) 00570 * Return: 0 if OK, 1 on error 00571 * 00572 * Notes: 00573 * (1) The numas of windowed mean and mean square are precomputed, 00574 * using numaWindowedMean() and numaWindowedMeanSquare(). 00575 * (2) Either or both of the variance and square-root of variance 00576 * are returned, where the variance is the average over the 00577 * window of the mean square difference of the pixel value 00578 * from the mean: 00579 * <(x - <x>)*(x - <x>)> = <x*x> - <x>*<x> 00580 */ 00581 l_int32 00582 numaWindowedVariance(NUMA *nam, 00583 NUMA *nams, 00584 NUMA **pnav, 00585 NUMA **pnarv) 00586 { 00587 l_int32 i, nm, nms; 00588 l_float32 var; 00589 l_float32 *fam, *fams, *fav, *farv; 00590 NUMA *nav, *narv; /* variance and square root of variance */ 00591 00592 PROCNAME("numaWindowedVariance"); 00593 00594 if (!nam) 00595 return ERROR_INT("nam not defined", procName, 1); 00596 if (!nams) 00597 return ERROR_INT("nams not defined", procName, 1); 00598 if (!pnav && !pnarv) 00599 return ERROR_INT("neither &nav nor &narv are defined", procName, 1); 00600 nm = numaGetCount(nam); 00601 nms = numaGetCount(nams); 00602 if (nm != nms) 00603 return ERROR_INT("sizes of nam and nams differ", procName, 1); 00604 00605 if (pnav) { 00606 nav = numaMakeConstant(0, nm); 00607 *pnav = nav; 00608 fav = numaGetFArray(nav, L_NOCOPY); 00609 } 00610 if (pnarv) { 00611 narv = numaMakeConstant(0, nm); 00612 *pnarv = narv; 00613 farv = numaGetFArray(narv, L_NOCOPY); 00614 } 00615 fam = numaGetFArray(nam, L_NOCOPY); 00616 fams = numaGetFArray(nams, L_NOCOPY); 00617 00618 for (i = 0; i < nm; i++) { 00619 var = fams[i] - fam[i] * fam[i]; 00620 if (pnav) 00621 fav[i] = var; 00622 if (pnarv) 00623 farv[i] = (l_float32)sqrt(var); 00624 } 00625 00626 return 0; 00627 } 00628 00629 00630 /*! 00631 * numaConvertToInt() 00632 * 00633 * Input: na 00634 * Return: na with all values rounded to nearest integer, or 00635 * null on error 00636 */ 00637 NUMA * 00638 numaConvertToInt(NUMA *nas) 00639 { 00640 l_int32 i, n, ival; 00641 NUMA *nad; 00642 00643 PROCNAME("numaConvertToInt"); 00644 00645 if (!nas) 00646 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00647 00648 n = numaGetCount(nas); 00649 if ((nad = numaCreate(n)) == NULL) 00650 return (NUMA *)ERROR_PTR("nad not made", procName, NULL); 00651 for (i = 0; i < n; i++) { 00652 numaGetIValue(nas, i, &ival); 00653 numaAddNumber(nad, ival); 00654 } 00655 return nad; 00656 } 00657 00658 00659 /*----------------------------------------------------------------------* 00660 * Histogram generation and statistics * 00661 *----------------------------------------------------------------------*/ 00662 /*! 00663 * numaMakeHistogram() 00664 * 00665 * Input: na 00666 * maxbins (max number of histogram bins) 00667 * &binsize (<return> size of histogram bins) 00668 * &binstart (<optional return> start val of minimum bin; 00669 * input NULL to force start at 0) 00670 * Return: na consisiting of histogram of integerized values, 00671 * or null on error. 00672 * 00673 * Note: 00674 * (1) This simple interface is designed for integer data. 00675 * The bins are of integer width and start on integer boundaries, 00676 * so the results on float data will not have high precision. 00677 * (2) Specify the max number of input bins. Then @binsize, 00678 * the size of bins necessary to accommodate the input data, 00679 * is returned. It is one of the sequence: 00680 * {1, 2, 5, 10, 20, 50, ...}. 00681 * (3) If &binstart is given, all values are accommodated, 00682 * and the min value of the starting bin is returned. 00683 * Otherwise, all negative values are discarded and 00684 * the histogram bins start at 0. 00685 */ 00686 NUMA * 00687 numaMakeHistogram(NUMA *na, 00688 l_int32 maxbins, 00689 l_int32 *pbinsize, 00690 l_int32 *pbinstart) 00691 { 00692 l_int32 i, n, ival, hval; 00693 l_int32 iminval, imaxval, range, binsize, nbins, ibin; 00694 l_float32 val, ratio; 00695 NUMA *nai, *nahist; 00696 00697 PROCNAME("numaMakeHistogram"); 00698 00699 if (!na) 00700 return (NUMA *)ERROR_PTR("na not defined", procName, NULL); 00701 if (!pbinsize) 00702 return (NUMA *)ERROR_PTR("&binsize not defined", procName, NULL); 00703 00704 /* Determine input range */ 00705 numaGetMin(na, &val, NULL); 00706 iminval = (l_int32)(val + 0.5); 00707 numaGetMax(na, &val, NULL); 00708 imaxval = (l_int32)(val + 0.5); 00709 if (pbinstart == NULL) { /* clip negative vals; start from 0 */ 00710 iminval = 0; 00711 if (imaxval < 0) 00712 return (NUMA *)ERROR_PTR("all values < 0", procName, NULL); 00713 } 00714 00715 /* Determine binsize */ 00716 range = imaxval - iminval + 1; 00717 if (range > maxbins - 1) { 00718 ratio = (l_float64)range / (l_float64)maxbins; 00719 binsize = 0; 00720 for (i = 0; i < NBinSizes; i++) { 00721 if (ratio < BinSizeArray[i]) { 00722 binsize = BinSizeArray[i]; 00723 break; 00724 } 00725 } 00726 if (binsize == 0) 00727 return (NUMA *)ERROR_PTR("numbers too large", procName, NULL); 00728 } 00729 else 00730 binsize = 1; 00731 *pbinsize = binsize; 00732 nbins = 1 + range / binsize; /* +1 seems to be sufficient */ 00733 00734 /* Redetermine iminval */ 00735 if (pbinstart && binsize > 1) { 00736 if (iminval >= 0) 00737 iminval = binsize * (iminval / binsize); 00738 else 00739 iminval = binsize * ((iminval - binsize + 1) / binsize); 00740 } 00741 if (pbinstart) 00742 *pbinstart = iminval; 00743 00744 #if DEBUG_HISTO 00745 fprintf(stderr, " imaxval = %d, range = %d, nbins = %d\n", 00746 imaxval, range, nbins); 00747 #endif /* DEBUG_HISTO */ 00748 00749 /* Use integerized data for input */ 00750 if ((nai = numaConvertToInt(na)) == NULL) 00751 return (NUMA *)ERROR_PTR("nai not made", procName, NULL); 00752 n = numaGetCount(nai); 00753 00754 /* Make histogram, converting value in input array 00755 * into a bin number for this histogram array. */ 00756 if ((nahist = numaCreate(nbins)) == NULL) 00757 return (NUMA *)ERROR_PTR("nahist not made", procName, NULL); 00758 numaSetCount(nahist, nbins); 00759 numaSetXParameters(nahist, iminval, binsize); 00760 for (i = 0; i < n; i++) { 00761 numaGetIValue(nai, i, &ival); 00762 ibin = (ival - iminval) / binsize; 00763 if (ibin >= 0 && ibin < nbins) { 00764 numaGetIValue(nahist, ibin, &hval); 00765 numaSetValue(nahist, ibin, hval + 1.0); 00766 } 00767 } 00768 00769 numaDestroy(&nai); 00770 return nahist; 00771 } 00772 00773 00774 /*! 00775 * numaMakeHistogramAuto() 00776 * 00777 * Input: na (numa of floats; these may be integers) 00778 * maxbins (max number of histogram bins; >= 1) 00779 * Return: na consisiting of histogram of quantized float values, 00780 * or null on error. 00781 * 00782 * Notes: 00783 * (1) This simple interface is designed for accurate binning 00784 * of both integer and float data. 00785 * (2) If the array data is integers, and the range of integers 00786 * is smaller than @maxbins, they are binned as they fall, 00787 * with binsize = 1. 00788 * (3) If the range of data, (maxval - minval), is larger than 00789 * @maxbins, or if the data is floats, they are binned into 00790 * exactly @maxbins bins. 00791 * (4) Unlike numaMakeHistogram(), these bins in general have 00792 * non-integer location and width, even for integer data. 00793 */ 00794 NUMA * 00795 numaMakeHistogramAuto(NUMA *na, 00796 l_int32 maxbins) 00797 { 00798 l_int32 i, n, imin, imax, irange, ibin, ival, allints; 00799 l_float32 minval, maxval, range, binsize, fval; 00800 NUMA *nah; 00801 00802 PROCNAME("numaMakeHistogramAuto"); 00803 00804 if (!na) 00805 return (NUMA *)ERROR_PTR("na not defined", procName, NULL); 00806 maxbins = L_MAX(1, maxbins); 00807 00808 /* Determine input range */ 00809 numaGetMin(na, &minval, NULL); 00810 numaGetMax(na, &maxval, NULL); 00811 00812 /* Determine if values are all integers */ 00813 n = numaGetCount(na); 00814 numaHasOnlyIntegers(na, maxbins, &allints); 00815 00816 /* Do simple integer binning if possible */ 00817 if (allints && (maxval - minval < maxbins)) { 00818 imin = (l_int32)minval; 00819 imax = (l_int32)maxval; 00820 irange = imax - imin + 1; 00821 nah = numaCreate(irange); 00822 numaSetCount(nah, irange); /* init */ 00823 numaSetXParameters(nah, minval, 1.0); 00824 for (i = 0; i < n; i++) { 00825 numaGetIValue(na, i, &ival); 00826 ibin = ival - imin; 00827 numaGetIValue(nah, ibin, &ival); 00828 numaSetValue(nah, ibin, ival + 1.0); 00829 } 00830 00831 return nah; 00832 } 00833 00834 /* Do float binning, even if the data is integers. */ 00835 range = maxval - minval; 00836 binsize = range / (l_float32)maxbins; 00837 if (range == 0.0) { 00838 nah = numaCreate(1); 00839 numaSetXParameters(nah, minval, binsize); 00840 numaAddNumber(nah, n); 00841 return nah; 00842 } 00843 nah = numaCreate(maxbins); 00844 numaSetCount(nah, maxbins); 00845 numaSetXParameters(nah, minval, binsize); 00846 for (i = 0; i < n; i++) { 00847 numaGetFValue(na, i, &fval); 00848 ibin = (l_int32)((fval - minval) / binsize); 00849 ibin = L_MIN(ibin, maxbins - 1); /* "edge" case; stay in bounds */ 00850 numaGetIValue(nah, ibin, &ival); 00851 numaSetValue(nah, ibin, ival + 1.0); 00852 } 00853 00854 return nah; 00855 } 00856 00857 00858 /*! 00859 * numaMakeHistogramClipped() 00860 * 00861 * Input: na 00862 * binsize (typically 1.0) 00863 * maxsize (of histogram ordinate) 00864 * Return: na (histogram of bins of size @binsize, starting with 00865 * the na[0] (x = 0.0) and going up to a maximum of 00866 * x = @maxsize, by increments of @binsize), or null on error 00867 * 00868 * Notes: 00869 * (1) This simple function generates a histogram of values 00870 * from na, discarding all values < 0.0 or greater than 00871 * min(@maxsize, maxval), where maxval is the maximum value in na. 00872 * The histogram data is put in bins of size delx = @binsize, 00873 * starting at x = 0.0. We use as many bins as are 00874 * needed to hold the data. 00875 */ 00876 NUMA * 00877 numaMakeHistogramClipped(NUMA *na, 00878 l_float32 binsize, 00879 l_float32 maxsize) 00880 { 00881 l_int32 i, n, nbins, ival, ibin; 00882 l_float32 val, maxval; 00883 NUMA *nad; 00884 00885 PROCNAME("numaMakeHistogramClipped"); 00886 00887 if (!na) 00888 return (NUMA *)ERROR_PTR("na not defined", procName, NULL); 00889 if (binsize <= 0.0) 00890 return (NUMA *)ERROR_PTR("binsize must be > 0.0", procName, NULL); 00891 if (binsize > maxsize) 00892 binsize = maxsize; /* just one bin */ 00893 00894 numaGetMax(na, &maxval, NULL); 00895 n = numaGetCount(na); 00896 maxsize = L_MIN(maxsize, maxval); 00897 nbins = (l_int32)(maxsize / binsize) + 1; 00898 00899 /* fprintf(stderr, "maxsize = %7.3f, nbins = %d\n", maxsize, nbins); */ 00900 00901 if ((nad = numaCreate(nbins)) == NULL) 00902 return (NUMA *)ERROR_PTR("nad not made", procName, NULL); 00903 numaSetXParameters(nad, 0.0, binsize); 00904 numaSetCount(nad, nbins); /* interpret zeroes in bins as data */ 00905 for (i = 0; i < n; i++) { 00906 numaGetFValue(na, i, &val); 00907 ibin = (l_int32)(val / binsize); 00908 if (ibin >= 0 && ibin < nbins) { 00909 numaGetIValue(nad, ibin, &ival); 00910 numaSetValue(nad, ibin, ival + 1.0); 00911 } 00912 } 00913 00914 return nad; 00915 } 00916 00917 00918 /*! 00919 * numaRebinHistogram() 00920 * 00921 * Input: nas (input histogram) 00922 * newsize (number of old bins contained in each new bin) 00923 * Return: nad (more coarsely re-binned histogram), or null on error 00924 */ 00925 NUMA * 00926 numaRebinHistogram(NUMA *nas, 00927 l_int32 newsize) 00928 { 00929 l_int32 i, j, ns, nd, index, count, val; 00930 l_float32 start, oldsize; 00931 NUMA *nad; 00932 00933 PROCNAME("numaRebinHistogram"); 00934 00935 if (!nas) 00936 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00937 if (newsize <= 1) 00938 return (NUMA *)ERROR_PTR("newsize must be > 1", procName, NULL); 00939 if ((ns = numaGetCount(nas)) == 0) 00940 return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL); 00941 00942 nd = (ns + newsize - 1) / newsize; 00943 if ((nad = numaCreate(nd)) == NULL) 00944 return (NUMA *)ERROR_PTR("nad not made", procName, NULL); 00945 numaGetXParameters(nad, &start, &oldsize); 00946 numaSetXParameters(nad, start, oldsize * newsize); 00947 00948 for (i = 0; i < nd; i++) { /* new bins */ 00949 count = 0; 00950 index = i * newsize; 00951 for (j = 0; j < newsize; j++) { 00952 if (index < ns) { 00953 numaGetIValue(nas, index, &val); 00954 count += val; 00955 index++; 00956 } 00957 } 00958 numaAddNumber(nad, count); 00959 } 00960 00961 return nad; 00962 } 00963 00964 00965 /*! 00966 * numaNormalizeHistogram() 00967 * 00968 * Input: nas (input histogram) 00969 * area (target sum of all numbers in dest histogram; 00970 * e.g., use area = 1.0 if this represents a 00971 * probability distribution) 00972 * Return: nad (normalized histogram), or null on error 00973 */ 00974 NUMA * 00975 numaNormalizeHistogram(NUMA *nas, 00976 l_float32 area) 00977 { 00978 l_int32 i, ns; 00979 l_float32 sum, factor, fval; 00980 NUMA *nad; 00981 00982 PROCNAME("numaNormalizeHistogram"); 00983 00984 if (!nas) 00985 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 00986 if (area <= 0.0) 00987 return (NUMA *)ERROR_PTR("area must be > 0.0", procName, NULL); 00988 if ((ns = numaGetCount(nas)) == 0) 00989 return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL); 00990 00991 numaGetSum(nas, &sum); 00992 factor = area / sum; 00993 00994 if ((nad = numaCreate(ns)) == NULL) 00995 return (NUMA *)ERROR_PTR("nad not made", procName, NULL); 00996 numaCopyXParameters(nad, nas); 00997 00998 for (i = 0; i < ns; i++) { 00999 numaGetFValue(nas, i, &fval); 01000 fval *= factor; 01001 numaAddNumber(nad, fval); 01002 } 01003 01004 return nad; 01005 } 01006 01007 01008 /*! 01009 * numaGetStatsUsingHistogram() 01010 * 01011 * Input: na (an arbitrary set of numbers; not ordered and not 01012 * a histogram) 01013 * maxbins (the maximum number of bins to be allowed in 01014 * the histogram; use 0 for consecutive integer bins) 01015 * &min (<optional return> min value of set) 01016 * &max (<optional return> max value of set) 01017 * &mean (<optional return> mean value of set) 01018 * &variance (<optional return> variance) 01019 * &median (<optional return> median value of set) 01020 * rank (in [0.0 ... 1.0]; median has a rank 0.5; ignored 01021 * if &rval == NULL) 01022 * &rval (<optional return> value in na corresponding to @rank) 01023 * &histo (<optional return> Numa histogram; use NULL to prevent) 01024 * Return: 0 if OK, 1 on error 01025 * 01026 * Notes: 01027 * (1) This is a simple interface for gathering statistics 01028 * from a numa, where a histogram is used 'under the covers' 01029 * to avoid sorting if a rank value is requested. In that case, 01030 * by using a histogram we are trading speed for accuracy, because 01031 * the values in @na are quantized to the center of a set of bins. 01032 * (2) If the median, other rank value, or histogram are not requested, 01033 * the calculation is all performed on the input Numa. 01034 * (3) The variance is the average of the square of the 01035 * difference from the mean. The median is the value in na 01036 * with rank 0.5. 01037 * (4) There are two situations where this gives rank results with 01038 * accuracy comparable to computing stastics directly on the input 01039 * data, without binning into a histogram: 01040 * (a) the data is integers and the range of data is less than 01041 * @maxbins, and 01042 * (b) the data is floats and the range is small compared to 01043 * @maxbins, so that the binsize is much less than 1. 01044 * (5) If a histogram is used and the numbers in the Numa extend 01045 * over a large range, you can limit the required storage by 01046 * specifying the maximum number of bins in the histogram. 01047 * Use @maxbins == 0 to force the bin size to be 1. 01048 * (6) This optionally returns the median and one arbitrary rank value. 01049 * If you need several rank values, return the histogram and use 01050 * numaHistogramGetValFromRank(nah, rank, &rval) 01051 * multiple times. 01052 */ 01053 l_int32 01054 numaGetStatsUsingHistogram(NUMA *na, 01055 l_int32 maxbins, 01056 l_float32 *pmin, 01057 l_float32 *pmax, 01058 l_float32 *pmean, 01059 l_float32 *pvariance, 01060 l_float32 *pmedian, 01061 l_float32 rank, 01062 l_float32 *prval, 01063 NUMA **phisto) 01064 { 01065 l_int32 i, n; 01066 l_float32 minval, maxval, fval, mean, sum; 01067 NUMA *nah; 01068 01069 PROCNAME("numaGetStatsUsingHistogram"); 01070 01071 if (pmin) *pmin = 0.0; 01072 if (pmax) *pmax = 0.0; 01073 if (pmean) *pmean = 0.0; 01074 if (pmedian) *pmedian = 0.0; 01075 if (pvariance) *pvariance = 0.0; 01076 if (!na) 01077 return ERROR_INT("na not defined", procName, 1); 01078 if ((n = numaGetCount(na)) == 0) 01079 return ERROR_INT("numa is empty", procName, 1); 01080 01081 numaGetMin(na, &minval, NULL); 01082 numaGetMax(na, &maxval, NULL); 01083 if (pmin) *pmin = minval; 01084 if (pmax) *pmax = maxval; 01085 if (pmean || pvariance) { 01086 sum = 0.0; 01087 for (i = 0; i < n; i++) { 01088 numaGetFValue(na, i, &fval); 01089 sum += fval; 01090 } 01091 mean = sum / (l_float32)n; 01092 if (pmean) *pmean = mean; 01093 } 01094 if (pvariance) { 01095 sum = 0.0; 01096 for (i = 0; i < n; i++) { 01097 numaGetFValue(na, i, &fval); 01098 sum += fval * fval; 01099 } 01100 *pvariance = sum / (l_float32)n - mean * mean; 01101 } 01102 01103 if (!pmedian && !prval && !phisto) 01104 return 0; 01105 01106 nah = numaMakeHistogramAuto(na, maxbins); 01107 if (pmedian) 01108 numaHistogramGetValFromRank(nah, 0.5, pmedian); 01109 if (prval) 01110 numaHistogramGetValFromRank(nah, rank, prval); 01111 if (phisto) 01112 *phisto = nah; 01113 else 01114 numaDestroy(&nah); 01115 return 0; 01116 } 01117 01118 01119 /*! 01120 * numaGetHistogramStats() 01121 * 01122 * Input: nahisto (histogram: y(x(i)), i = 0 ... nbins - 1) 01123 * startx (x value of first bin: x(0)) 01124 * deltax (x increment between bins; the bin size; x(1) - x(0)) 01125 * &xmean (<optional return> mean value of histogram) 01126 * &xmedian (<optional return> median value of histogram) 01127 * &xmode (<optional return> mode value of histogram: 01128 * xmode = x(imode), where y(xmode) >= y(x(i)) for 01129 * all i != imode) 01130 * &xvariance (<optional return> variance of x) 01131 * Return: 0 if OK, 1 on error 01132 * 01133 * Notes: 01134 * (1) If the histogram represents the relation y(x), the 01135 * computed values that are returned are the x values. 01136 * These are NOT the bucket indices i; they are related to the 01137 * bucket indices by 01138 * x(i) = startx + i * deltax 01139 */ 01140 l_int32 01141 numaGetHistogramStats(NUMA *nahisto, 01142 l_float32 startx, 01143 l_float32 deltax, 01144 l_float32 *pxmean, 01145 l_float32 *pxmedian, 01146 l_float32 *pxmode, 01147 l_float32 *pxvariance) 01148 { 01149 PROCNAME("numaGetHistogramStats"); 01150 01151 if (pxmean) *pxmean = 0.0; 01152 if (pxmedian) *pxmedian = 0.0; 01153 if (pxmode) *pxmode = 0.0; 01154 if (pxvariance) *pxvariance = 0.0; 01155 if (!nahisto) 01156 return ERROR_INT("nahisto not defined", procName, 1); 01157 01158 return numaGetHistogramStatsOnInterval(nahisto, startx, deltax, 0, 0, 01159 pxmean, pxmedian, pxmode, 01160 pxvariance); 01161 } 01162 01163 01164 /*! 01165 * numaGetHistogramStatsOnInterval() 01166 * 01167 * Input: nahisto (histogram: y(x(i)), i = 0 ... nbins - 1) 01168 * startx (x value of first bin: x(0)) 01169 * deltax (x increment between bins; the bin size; x(1) - x(0)) 01170 * ifirst (first bin to use for collecting stats) 01171 * ilast (last bin for collecting stats; use 0 to go to the end) 01172 * &xmean (<optional return> mean value of histogram) 01173 * &xmedian (<optional return> median value of histogram) 01174 * &xmode (<optional return> mode value of histogram: 01175 * xmode = x(imode), where y(xmode) >= y(x(i)) for 01176 * all i != imode) 01177 * &xvariance (<optional return> variance of x) 01178 * Return: 0 if OK, 1 on error 01179 * 01180 * Notes: 01181 * (1) If the histogram represents the relation y(x), the 01182 * computed values that are returned are the x values. 01183 * These are NOT the bucket indices i; they are related to the 01184 * bucket indices by 01185 * x(i) = startx + i * deltax 01186 */ 01187 l_int32 01188 numaGetHistogramStatsOnInterval(NUMA *nahisto, 01189 l_float32 startx, 01190 l_float32 deltax, 01191 l_int32 ifirst, 01192 l_int32 ilast, 01193 l_float32 *pxmean, 01194 l_float32 *pxmedian, 01195 l_float32 *pxmode, 01196 l_float32 *pxvariance) 01197 { 01198 l_int32 i, n, imax; 01199 l_float32 sum, sumval, halfsum, moment, var, x, y, ymax; 01200 01201 PROCNAME("numaGetHistogramStats"); 01202 01203 if (pxmean) *pxmean = 0.0; 01204 if (pxmedian) *pxmedian = 0.0; 01205 if (pxmode) *pxmode = 0.0; 01206 if (pxvariance) *pxvariance = 0.0; 01207 if (!nahisto) 01208 return ERROR_INT("nahisto not defined", procName, 1); 01209 if (!pxmean && !pxmedian && !pxmode && !pxvariance) 01210 return ERROR_INT("nothing to compute", procName, 1); 01211 01212 n = numaGetCount(nahisto); 01213 if (ilast <= 0) ilast = n - 1; 01214 if (ifirst < 0) ifirst = 0; 01215 if (ifirst > ilast || ifirst > n - 1) 01216 return ERROR_INT("ifirst is too large", procName, 1); 01217 for (sum = 0.0, moment = 0.0, var = 0.0, i = ifirst; i <= ilast ; i++) { 01218 x = startx + i * deltax; 01219 numaGetFValue(nahisto, i, &y); 01220 sum += y; 01221 moment += x * y; 01222 var += x * x * y; 01223 } 01224 if (sum == 0.0) 01225 return ERROR_INT("sum is 0", procName, 1); 01226 01227 if (pxmean) 01228 *pxmean = moment / sum; 01229 if (pxvariance) 01230 *pxvariance = var / sum - moment * moment / (sum * sum); 01231 01232 if (pxmedian) { 01233 halfsum = sum / 2.0; 01234 for (sumval = 0.0, i = ifirst; i <= ilast; i++) { 01235 numaGetFValue(nahisto, i, &y); 01236 sumval += y; 01237 if (sumval >= halfsum) { 01238 *pxmedian = startx + i * deltax; 01239 break; 01240 } 01241 } 01242 } 01243 01244 if (pxmode) { 01245 ymax = -1.0e10; 01246 for (i = ifirst; i <= ilast; i++) { 01247 numaGetFValue(nahisto, i, &y); 01248 if (y > ymax) { 01249 ymax = y; 01250 imax = i; 01251 } 01252 } 01253 *pxmode = startx + imax * deltax; 01254 } 01255 01256 return 0; 01257 } 01258 01259 01260 /*! 01261 * numaMakeRankFromHistogram() 01262 * 01263 * Input: startx (xval corresponding to first element in nay) 01264 * deltax (x increment between array elements in nay) 01265 * nasy (input histogram, assumed equally spaced) 01266 * npts (number of points to evaluate rank function) 01267 * &nax (<optional return> array of x values in range) 01268 * &nay (<return> rank array of specified npts) 01269 * Return: 0 if OK, 1 on error 01270 */ 01271 l_int32 01272 numaMakeRankFromHistogram(l_float32 startx, 01273 l_float32 deltax, 01274 NUMA *nasy, 01275 l_int32 npts, 01276 NUMA **pnax, 01277 NUMA **pnay) 01278 { 01279 l_int32 i, n; 01280 l_float32 sum, fval; 01281 NUMA *nan, *nar; 01282 01283 PROCNAME("numaMakeRankFromHistogram"); 01284 01285 if (pnax) *pnax = NULL; 01286 if (!pnay) 01287 return ERROR_INT("&nay not defined", procName, 1); 01288 *pnay = NULL; 01289 if (!nasy) 01290 return ERROR_INT("nasy not defined", procName, 1); 01291 if (!pnay) 01292 return ERROR_INT("&nay not defined", procName, 1); 01293 if ((n = numaGetCount(nasy)) == 0) 01294 return ERROR_INT("no bins in nas", procName, 1); 01295 01296 /* Normalize and generate the rank array corresponding to 01297 * the binned histogram. */ 01298 nan = numaNormalizeHistogram(nasy, 1.0); 01299 nar = numaCreate(n + 1); /* rank numa corresponding to nan */ 01300 sum = 0.0; 01301 numaAddNumber(nar, sum); /* first element is 0.0 */ 01302 for (i = 0; i < n; i++) { 01303 numaGetFValue(nan, i, &fval); 01304 sum += fval; 01305 numaAddNumber(nar, sum); 01306 } 01307 01308 /* Compute rank array on full range with specified 01309 * number of points and correspondence to x-values. */ 01310 numaInterpolateEqxInterval(startx, deltax, nar, L_LINEAR_INTERP, 01311 startx, startx + n * deltax, npts, 01312 pnax, pnay); 01313 numaDestroy(&nan); 01314 numaDestroy(&nar); 01315 return 0; 01316 } 01317 01318 01319 /*! 01320 * numaHistogramGetRankFromVal() 01321 * 01322 * Input: na (histogram) 01323 * rval (value of input sample for which we want the rank) 01324 * &rank (<return> fraction of total samples below rval) 01325 * Return: 0 if OK, 1 on error 01326 * 01327 * Notes: 01328 * (1) If we think of the histogram as a function y(x), normalized 01329 * to 1, for a given input value of x, this computes the 01330 * rank of x, which is the integral of y(x) from the start 01331 * value of x to the input value. 01332 * (2) This function only makes sense when applied to a Numa that 01333 * is a histogram. The values in the histogram can be ints and 01334 * floats, and are computed as floats. The rank is returned 01335 * as a float between 0.0 and 1.0. 01336 * (3) The numa parameters startx and binsize are used to 01337 * compute x from the Numa index i. 01338 */ 01339 l_int32 01340 numaHistogramGetRankFromVal(NUMA *na, 01341 l_float32 rval, 01342 l_float32 *prank) 01343 { 01344 l_int32 i, ibinval, n; 01345 l_float32 startval, binsize, binval, maxval, fractval, total, sum, val; 01346 01347 PROCNAME("numaHistogramGetRankFromVal"); 01348 01349 if (!prank) 01350 return ERROR_INT("prank not defined", procName, 1); 01351 *prank = 0.0; 01352 if (!na) 01353 return ERROR_INT("na not defined", procName, 1); 01354 numaGetXParameters(na, &startval, &binsize); 01355 n = numaGetCount(na); 01356 if (rval < startval) 01357 return 0; 01358 maxval = startval + n * binsize; 01359 if (rval > maxval) { 01360 *prank = 1.0; 01361 return 0; 01362 } 01363 01364 binval = (rval - startval) / binsize; 01365 ibinval = (l_int32)binval; 01366 if (ibinval >= n) { 01367 *prank = 1.0; 01368 return 0; 01369 } 01370 fractval = binval - (l_float32)ibinval; 01371 01372 sum = 0.0; 01373 for (i = 0; i < ibinval; i++) { 01374 numaGetFValue(na, i, &val); 01375 sum += val; 01376 } 01377 numaGetFValue(na, ibinval, &val); 01378 sum += fractval * val; 01379 numaGetSum(na, &total); 01380 *prank = sum / total; 01381 01382 /* fprintf(stderr, "binval = %7.3f, rank = %7.3f\n", binval, *prank); */ 01383 01384 return 0; 01385 } 01386 01387 01388 /*! 01389 * numaHistogramGetValFromRank() 01390 * 01391 * Input: na (histogram) 01392 * rank (fraction of total samples) 01393 * &rval (<return> approx. to the bin value) 01394 * Return: 0 if OK, 1 on error 01395 * 01396 * Notes: 01397 * (1) If we think of the histogram as a function y(x), this returns 01398 * the value x such that the integral of y(x) from the start 01399 * value to x gives the fraction 'rank' of the integral 01400 * of y(x) over all bins. 01401 * (2) This function only makes sense when applied to a Numa that 01402 * is a histogram. The values in the histogram can be ints and 01403 * floats, and are computed as floats. The val is returned 01404 * as a float, even though the buckets are of integer width. 01405 * (3) The numa parameters startx and binsize are used to 01406 * compute x from the Numa index i. 01407 */ 01408 l_int32 01409 numaHistogramGetValFromRank(NUMA *na, 01410 l_float32 rank, 01411 l_float32 *prval) 01412 { 01413 l_int32 i, n; 01414 l_float32 startval, binsize, rankcount, total, sum, fract, val; 01415 01416 PROCNAME("numaHistogramGetValFromRank"); 01417 01418 if (!prval) 01419 return ERROR_INT("prval not defined", procName, 1); 01420 *prval = 0.0; 01421 if (!na) 01422 return ERROR_INT("na not defined", procName, 1); 01423 if (rank < 0.0) { 01424 L_WARNING("rank < 0; setting to 0.0", procName); 01425 rank = 0.0; 01426 } 01427 if (rank > 1.0) { 01428 L_WARNING("rank > 1.0; setting to 1.0", procName); 01429 rank = 1.0; 01430 } 01431 01432 n = numaGetCount(na); 01433 numaGetXParameters(na, &startval, &binsize); 01434 numaGetSum(na, &total); 01435 rankcount = rank * total; /* count that corresponds to rank */ 01436 sum = 0.0; 01437 for (i = 0; i < n; i++) { 01438 numaGetFValue(na, i, &val); 01439 if (sum + val >= rankcount) 01440 break; 01441 sum += val; 01442 } 01443 if (val <= 0.0) /* can == 0 if rank == 0.0 */ 01444 fract = 0.0; 01445 else /* sum + fract * val = rankcount */ 01446 fract = (rankcount - sum) / val; 01447 01448 /* The use of the fraction of a bin allows a simple calculation 01449 * for the histogram value at the given rank. */ 01450 *prval = startval + binsize * ((l_float32)i + fract); 01451 01452 /* fprintf(stderr, "rank = %7.3f, val = %7.3f\n", rank, *prval); */ 01453 01454 return 0; 01455 } 01456 01457 01458 /*! 01459 * numaDiscretizeRankAndIntensity() 01460 * 01461 * Input: na (normalized histogram of probability density vs intensity) 01462 * nbins (number of bins at which the rank is divided) 01463 * &pnarbin (<optional return> rank bin value vs intensity) 01464 * &pnam (<optional return> median intensity in a bin vs 01465 * rank bin value, with @nbins of discretized rank values) 01466 * &pnar (<optional return> rank vs intensity; this is 01467 * a cumulative norm histogram) 01468 * &pnabb (<optional return> intensity at the right bin boundary 01469 * vs rank bin) 01470 * Return: 0 if OK, 1 on error 01471 * 01472 * Notes: 01473 * (1) We are inverting the rank(intensity) function to get 01474 * the intensity(rank) function at @nbins equally spaced 01475 * values of rank between 0.0 and 1.0. We save integer values 01476 * for the intensity. 01477 * (2) We are using the word "intensity" to describe the type of 01478 * array values, but any array of non-negative numbers will work. 01479 * (3) The output arrays give the following mappings, where the 01480 * input is a normalized histogram of array values: 01481 * array values --> rank bin number (narbin) 01482 * rank bin number --> median array value in bin (nam) 01483 * array values --> cumulative norm = rank (nar) 01484 * rank bin number --> array value at right bin edge (nabb) 01485 */ 01486 l_int32 01487 numaDiscretizeRankAndIntensity(NUMA *na, 01488 l_int32 nbins, 01489 NUMA **pnarbin, 01490 NUMA **pnam, 01491 NUMA **pnar, 01492 NUMA **pnabb) 01493 { 01494 NUMA *nar; /* rank value as function of intensity */ 01495 NUMA *nam; /* median intensity in the rank bins */ 01496 NUMA *nabb; /* rank bin right boundaries (in intensity) */ 01497 NUMA *narbin; /* binned rank value as a function of intensity */ 01498 l_int32 i, j, npts, start, midfound, mcount, rightedge; 01499 l_float32 sum, midrank, endrank, val; 01500 01501 PROCNAME("numaDiscretizeRankAndIntensity"); 01502 01503 if (!na) 01504 return ERROR_INT("na not defined", procName, 1); 01505 if (nbins < 2) 01506 return ERROR_INT("nbins must be > 1", procName, 1); 01507 if (!pnarbin && !pnam && !pnar && !pnabb) 01508 return ERROR_INT("no output requested", procName, 1); 01509 01510 /* Get cumulative normalized histogram (rank vs intensity value). 01511 * For a normalized histogram from an 8 bpp grayscale image 01512 * as input, we have 256 bins and 257 points in the 01513 * cumulative (rank) histogram. */ 01514 npts = numaGetCount(na); 01515 nar = numaCreate(npts + 1); 01516 sum = 0.0; 01517 numaAddNumber(nar, sum); /* left side of first bin */ 01518 for (i = 0; i < npts; i++) { 01519 numaGetFValue(na, i, &val); 01520 sum += val; 01521 numaAddNumber(nar, sum); 01522 } 01523 01524 if ((nam = numaCreate(nbins)) == NULL) 01525 return ERROR_INT("nam not made", procName, 1); 01526 if ((narbin = numaCreate(npts)) == NULL) 01527 return ERROR_INT("narbin not made", procName, 1); 01528 if ((nabb = numaCreate(nbins)) == NULL) 01529 return ERROR_INT("nabb not made", procName, 1); 01530 01531 /* We find the intensity value at the right edge of each of 01532 * the rank bins. We also find the median intensity in the bin, 01533 * where approximately half the samples are lower and half are 01534 * higher. This can be considered as a simple approximation 01535 * for the average intensity in the bin. */ 01536 start = 0; /* index in nar */ 01537 mcount = 0; /* count of median values in rank bins; not to exceed nbins */ 01538 for (i = 0; i < nbins; i++) { 01539 midrank = (l_float32)(i + 0.5) / (l_float32)(nbins); 01540 endrank = (l_float32)(i + 1.0) / (l_float32)(nbins); 01541 endrank = L_MAX(0.0, L_MIN(endrank - 0.001, 1.0)); 01542 midfound = FALSE; 01543 for (j = start; j < npts; j++) { /* scan up for each bin value */ 01544 numaGetFValue(nar, j, &val); 01545 /* Use (j == npts - 1) tests in case all weight is at top end */ 01546 if ((!midfound && val >= midrank) || 01547 (mcount < nbins && j == npts - 1)) { 01548 midfound = TRUE; 01549 numaAddNumber(nam, j); 01550 mcount++; 01551 } 01552 if ((val >= endrank) || (j == npts - 1)) { 01553 numaAddNumber(nabb, j); 01554 if (val == endrank) 01555 start = j; 01556 else 01557 start = j - 1; 01558 break; 01559 } 01560 } 01561 } 01562 numaSetValue(nabb, nbins - 1, npts - 1); /* extend to max */ 01563 01564 /* Error checking: did we get data in all bins? */ 01565 if (mcount != nbins) 01566 L_WARNING_INT2("found data for %d bins; should be %d", 01567 procName, mcount, nbins); 01568 01569 /* Generate LUT that maps from intensity to bin number */ 01570 start = 0; 01571 for (i = 0; i < nbins; i++) { 01572 numaGetIValue(nabb, i, &rightedge); 01573 for (j = start; j < npts; j++) { 01574 if (j <= rightedge) 01575 numaAddNumber(narbin, i); 01576 if (j > rightedge) { 01577 start = j; 01578 break; 01579 } 01580 if (j == npts - 1) { /* we're done */ 01581 start = j + 1; 01582 break; 01583 } 01584 } 01585 } 01586 01587 if (pnarbin) 01588 *pnarbin = narbin; 01589 else 01590 numaDestroy(&narbin); 01591 if (pnam) 01592 *pnam = nam; 01593 else 01594 numaDestroy(&nam); 01595 if (pnar) 01596 *pnar = nar; 01597 else 01598 numaDestroy(&nar); 01599 if (pnabb) 01600 *pnabb = nabb; 01601 else 01602 numaDestroy(&nabb); 01603 return 0; 01604 } 01605 01606 01607 /*! 01608 * numaGetRankBinValues() 01609 * 01610 * Input: na (just an array of values) 01611 * nbins (number of bins at which the rank is divided) 01612 * &pnarbin (<optional return> rank bin value vs array value) 01613 * &pnam (<optional return> median intensity in a bin vs 01614 * rank bin value, with @nbins of discretized rank values) 01615 * Return: 0 if OK, 1 on error 01616 * 01617 * Notes: 01618 * (1) Simple interface for getting a binned rank representation 01619 * of an input array of values. This returns two mappings: 01620 * array value --> rank bin number (narbin) 01621 * rank bin number --> median array value in each rank bin (nam) 01622 */ 01623 l_int32 01624 numaGetRankBinValues(NUMA *na, 01625 l_int32 nbins, 01626 NUMA **pnarbin, 01627 NUMA **pnam) 01628 { 01629 NUMA *nah, *nan; /* histo and normalized histo */ 01630 l_int32 maxbins, discardval; 01631 l_float32 maxval, delx; 01632 01633 PROCNAME("numaGetRankBinValues"); 01634 01635 if (pnarbin) *pnarbin = NULL; 01636 if (pnam) *pnam = NULL; 01637 if (!pnarbin && !pnam) 01638 return ERROR_INT("no output requested", procName, 1); 01639 if (!na) 01640 return ERROR_INT("na not defined", procName, 1); 01641 if (numaGetCount(na) == 0) 01642 return ERROR_INT("na is empty", procName, 1); 01643 if (nbins < 2) 01644 return ERROR_INT("nbins must be > 1", procName, 1); 01645 01646 /* Get normalized histogram */ 01647 numaGetMax(na, &maxval, NULL); 01648 maxbins = L_MIN(100002, (l_int32)maxval + 2); 01649 nah = numaMakeHistogram(na, maxbins, &discardval, NULL); 01650 nan = numaNormalizeHistogram(nah, 1.0); 01651 01652 /* Warn if there is a scale change. This shouldn't happen 01653 * unless the max value is above 100000. */ 01654 numaGetXParameters(nan, NULL, &delx); 01655 if (delx > 1.0) 01656 L_WARNING_FLOAT("scale change: delx = %6.2f", procName, delx); 01657 01658 /* Rank bin the results */ 01659 numaDiscretizeRankAndIntensity(nan, nbins, pnarbin, pnam, NULL, NULL); 01660 numaDestroy(&nah); 01661 numaDestroy(&nan); 01662 return 0; 01663 } 01664 01665 01666 /*----------------------------------------------------------------------* 01667 * Splitting a distribution * 01668 *----------------------------------------------------------------------*/ 01669 /*! 01670 * numaSplitDistribution() 01671 * 01672 * Input: na (histogram) 01673 * scorefract (fraction of the max score, used to determine 01674 * the range over which the histogram min is searched) 01675 * &splitindex (<optional return> index for splitting) 01676 * &ave1 (<optional return> average of lower distribution) 01677 * &ave2 (<optional return> average of upper distribution) 01678 * &num1 (<optional return> population of lower distribution) 01679 * &num2 (<optional return> population of upper distribution) 01680 * &nascore (<optional return> for debugging; otherwise use null) 01681 * Return: 0 if OK, 1 on error 01682 * 01683 * Notes: 01684 * (1) This function is intended to be used on a distribution of 01685 * values that represent two sets, such as a histogram of 01686 * pixel values for an image with a fg and bg, and the goal 01687 * is to determine the averages of the two sets and the 01688 * best splitting point. 01689 * (2) The Otsu method finds a split point that divides the distribution 01690 * into two parts by maximizing a score function that is the 01691 * product of two terms: 01692 * (a) the square of the difference of centroids, (ave1 - ave2)^2 01693 * (b) fract1 * (1 - fract1) 01694 * where fract1 is the fraction in the lower distribution. 01695 * (3) This works well for images where the fg and bg are 01696 * each relatively homogeneous and well-separated in color. 01697 * However, if the actual fg and bg sets are very different 01698 * in size, and the bg is highly varied, as can occur in some 01699 * scanned document images, this will bias the split point 01700 * into the larger "bump" (i.e., toward the point where the 01701 * (b) term reaches its maximum of 0.25 at fract1 = 0.5. 01702 * To avoid this, we define a range of values near the 01703 * maximum of the score function, and choose the value within 01704 * this range such that the histogram itself has a minimum value. 01705 * The range is determined by scorefract: we include all abscissa 01706 * values to the left and right of the value that maximizes the 01707 * score, such that the score stays above (1 - scorefract) * maxscore. 01708 * The intuition behind this modification is to try to find 01709 * a split point that both has a high variance score and is 01710 * at or near a minimum in the histogram, so that the histogram 01711 * slope is small at the split point. 01712 * (4) We normalize the score so that if the two distributions 01713 * were of equal size and at opposite ends of the numa, the 01714 * score would be 1.0. 01715 */ 01716 l_int32 01717 numaSplitDistribution(NUMA *na, 01718 l_float32 scorefract, 01719 l_int32 *psplitindex, 01720 l_float32 *pave1, 01721 l_float32 *pave2, 01722 l_float32 *pnum1, 01723 l_float32 *pnum2, 01724 NUMA **pnascore) 01725 { 01726 l_int32 i, n, bestsplit, minrange, maxrange, maxindex; 01727 l_float32 ave1, ave2, ave1prev, ave2prev; 01728 l_float32 num1, num2, num1prev, num2prev; 01729 l_float32 val, minval, sum, fract1; 01730 l_float32 norm, score, minscore, maxscore; 01731 NUMA *nascore, *naave1, *naave2, *nanum1, *nanum2; 01732 01733 PROCNAME("numaSplitDistribution"); 01734 01735 if (!na) 01736 return ERROR_INT("na not defined", procName, 1); 01737 01738 n = numaGetCount(na); 01739 if (n <= 1) 01740 return ERROR_INT("n = 1 in histogram", procName, 1); 01741 numaGetSum(na, &sum); 01742 if (sum <= 0.0) 01743 return ERROR_INT("sum <= 0.0", procName, 1); 01744 norm = 4.0 / ((n - 1) * (n - 1)); 01745 ave1prev = 0.0; 01746 numaGetHistogramStats(na, 0.0, 1.0, &ave2prev, NULL, NULL, NULL); 01747 num1prev = 0.0; 01748 num2prev = sum; 01749 maxindex = n / 2; /* initialize with something */ 01750 01751 /* Split the histogram with [0 ... i] in the lower part 01752 * and [i+1 ... n-1] in upper part. First, compute an otsu 01753 * score for each possible splitting. */ 01754 nascore = numaCreate(n); 01755 if (pave2) naave1 = numaCreate(n); 01756 if (pave2) naave2 = numaCreate(n); 01757 if (pnum1) nanum1 = numaCreate(n); 01758 if (pnum2) nanum2 = numaCreate(n); 01759 maxscore = 0.0; 01760 for (i = 0; i < n; i++) { 01761 numaGetFValue(na, i, &val); 01762 num1 = num1prev + val; 01763 if (num1 == 0) 01764 ave1 = ave1prev; 01765 else 01766 ave1 = (num1prev * ave1prev + i * val) / num1; 01767 num2 = num2prev - val; 01768 if (num2 == 0) 01769 ave2 = ave2prev; 01770 else 01771 ave2 = (num2prev * ave2prev - i * val) / num2; 01772 fract1 = num1 / sum; 01773 score = norm * (fract1 * (1 - fract1)) * (ave2 - ave1) * (ave2 - ave1); 01774 numaAddNumber(nascore, score); 01775 if (pave1) numaAddNumber(naave1, ave1); 01776 if (pave2) numaAddNumber(naave2, ave2); 01777 if (pnum1) numaAddNumber(nanum1, num1); 01778 if (pnum1) numaAddNumber(nanum2, num2); 01779 if (score > maxscore) { 01780 maxscore = score; 01781 maxindex = i; 01782 } 01783 num1prev = num1; 01784 num2prev = num2; 01785 ave1prev = ave1; 01786 ave2prev = ave2; 01787 } 01788 01789 /* Next, for all contiguous scores within a specified fraction 01790 * of the max, choose the split point as the value with the 01791 * minimum in the histogram. */ 01792 minscore = (1. - scorefract) * maxscore; 01793 for (i = maxindex - 1; i >= 0; i--) { 01794 numaGetFValue(nascore, i, &val); 01795 if (val < minscore) 01796 break; 01797 } 01798 minrange = i + 1; 01799 for (i = maxindex + 1; i < n; i++) { 01800 numaGetFValue(nascore, i, &val); 01801 if (val < minscore) 01802 break; 01803 } 01804 maxrange = i - 1; 01805 numaGetFValue(na, minrange, &minval); 01806 bestsplit = minrange; 01807 for (i = minrange + 1; i <= maxrange; i++) { 01808 numaGetFValue(na, i, &val); 01809 if (val < minval) { 01810 minval = val; 01811 bestsplit = i; 01812 } 01813 } 01814 01815 /* Add one to the bestsplit value to get the threshold value, 01816 * because when we take a threshold, as in pixThresholdToBinary(), 01817 * we always choose the set with values below the threshold. */ 01818 bestsplit = L_MIN(255, bestsplit + 1); 01819 01820 if (psplitindex) *psplitindex = bestsplit; 01821 if (pave1) numaGetFValue(naave1, bestsplit, pave1); 01822 if (pave2) numaGetFValue(naave2, bestsplit, pave2); 01823 if (pnum1) numaGetFValue(nanum1, bestsplit, pnum1); 01824 if (pnum2) numaGetFValue(nanum2, bestsplit, pnum2); 01825 01826 if (pnascore) { /* debug mode */ 01827 fprintf(stderr, "minrange = %d, maxrange = %d\n", minrange, maxrange); 01828 fprintf(stderr, "minval = %10.0f\n", minval); 01829 gplotSimple1(nascore, GPLOT_PNG, "/tmp/nascore", 01830 "Score for split distribution"); 01831 *pnascore = nascore; 01832 } 01833 else 01834 numaDestroy(&nascore); 01835 01836 if (pave1) numaDestroy(&naave1); 01837 if (pave2) numaDestroy(&naave2); 01838 if (pnum1) numaDestroy(&nanum1); 01839 if (pnum2) numaDestroy(&nanum2); 01840 return 0; 01841 } 01842 01843 01844 /*----------------------------------------------------------------------* 01845 * Extrema finding * 01846 *----------------------------------------------------------------------*/ 01847 /*! 01848 * numaFindPeaks() 01849 * 01850 * Input: source na 01851 * max number of peaks to be found 01852 * fract1 (min fraction of peak value) 01853 * fract2 (min slope) 01854 * Return: peak na, or null on error. 01855 * 01856 * Notes: 01857 * (1) The returned na consists of sets of four numbers representing 01858 * the peak, in the following order: 01859 * left edge; peak center; right edge; normalized peak area 01860 */ 01861 NUMA * 01862 numaFindPeaks(NUMA *nas, 01863 l_int32 nmax, 01864 l_float32 fract1, 01865 l_float32 fract2) 01866 { 01867 l_int32 i, k, n, maxloc, lloc, rloc; 01868 l_float32 fmaxval, sum, total, newtotal, val, lastval; 01869 l_float32 peakfract; 01870 NUMA *na, *napeak; 01871 01872 PROCNAME("numaFindPeaks"); 01873 01874 if (!nas) 01875 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 01876 n = numaGetCount(nas); 01877 numaGetSum(nas, &total); 01878 01879 /* We munge this copy */ 01880 if ((na = numaCopy(nas)) == NULL) 01881 return (NUMA *)ERROR_PTR("na not made", procName, NULL); 01882 if ((napeak = numaCreate(4 * nmax)) == NULL) 01883 return (NUMA *)ERROR_PTR("napeak not made", procName, NULL); 01884 01885 for (k = 0; k < nmax; k++) { 01886 numaGetSum(na, &newtotal); 01887 if (newtotal == 0.0) /* sanity check */ 01888 break; 01889 numaGetMax(na, &fmaxval, &maxloc); 01890 sum = fmaxval; 01891 lastval = fmaxval; 01892 lloc = 0; 01893 for (i = maxloc - 1; i >= 0; --i) { 01894 numaGetFValue(na, i, &val); 01895 if (val == 0.0) { 01896 lloc = i + 1; 01897 break; 01898 } 01899 if (val > fract1 * fmaxval) { 01900 sum += val; 01901 lastval = val; 01902 continue; 01903 } 01904 if (lastval - val > fract2 * lastval) { 01905 sum += val; 01906 lastval = val; 01907 continue; 01908 } 01909 lloc = i; 01910 break; 01911 } 01912 lastval = fmaxval; 01913 rloc = n - 1; 01914 for (i = maxloc + 1; i < n; ++i) { 01915 numaGetFValue(na, i, &val); 01916 if (val == 0.0) { 01917 rloc = i - 1; 01918 break; 01919 } 01920 if (val > fract1 * fmaxval) { 01921 sum += val; 01922 lastval = val; 01923 continue; 01924 } 01925 if (lastval - val > fract2 * lastval) { 01926 sum += val; 01927 lastval = val; 01928 continue; 01929 } 01930 rloc = i; 01931 break; 01932 } 01933 peakfract = sum / total; 01934 numaAddNumber(napeak, lloc); 01935 numaAddNumber(napeak, maxloc); 01936 numaAddNumber(napeak, rloc); 01937 numaAddNumber(napeak, peakfract); 01938 01939 for (i = lloc; i <= rloc; i++) 01940 numaSetValue(na, i, 0.0); 01941 } 01942 01943 numaDestroy(&na); 01944 return napeak; 01945 } 01946 01947 01948 /*! 01949 * numaFindExtrema() 01950 * 01951 * Input: nas (input values) 01952 * delta (relative amount to resolve peaks and valleys) 01953 * Return: nad (locations of extrema), or null on error 01954 * 01955 * Notes: 01956 * (1) This returns a sequence of extrema (peaks and valleys). 01957 * (2) The algorithm is analogous to that for determining 01958 * mountain peaks. Suppose we have a local peak, with 01959 * bumps on the side. Under what conditions can we consider 01960 * those 'bumps' to be actual peaks? The answer: if the 01961 * bump is separated from the peak by a saddle that is at 01962 * least 500 feet below the bump. 01963 * (3) Operationally, suppose we are looking for a peak. 01964 * We are keeping the largest value we've seen since the 01965 * last valley, and are looking for a value that is delta 01966 * BELOW our current peak. When we find such a value, 01967 * we label the peak, use the current value to label the 01968 * valley, and then do the same operation in reverse (looking 01969 * for a valley). 01970 */ 01971 NUMA * 01972 numaFindExtrema(NUMA *nas, 01973 l_float32 delta) 01974 { 01975 l_int32 i, n, found, loc, direction; 01976 l_float32 startval, val, maxval, minval; 01977 NUMA *nad; 01978 01979 PROCNAME("numaFindExtrema"); 01980 01981 if (!nas) 01982 return (NUMA *)ERROR_PTR("nas not defined", procName, NULL); 01983 01984 n = numaGetCount(nas); 01985 nad = numaCreate(0); 01986 01987 /* We don't know if we'll find a peak or valley first, 01988 * but use the first element of nas as the reference point. 01989 * Break when we deviate by 'delta' from the first point. */ 01990 numaGetFValue(nas, 0, &startval); 01991 found = FALSE; 01992 for (i = 1; i < n; i++) { 01993 numaGetFValue(nas, i, &val); 01994 if (L_ABS(val - startval) >= delta) { 01995 found = TRUE; 01996 break; 01997 } 01998 } 01999 02000 if (!found) 02001 return nad; /* it's empty */ 02002 02003 /* Are we looking for a peak or a valley? */ 02004 if (val > startval) { /* peak */ 02005 direction = 1; 02006 maxval = val; 02007 } 02008 else { 02009 direction = -1; 02010 minval = val; 02011 } 02012 loc = i; 02013 02014 /* Sweep through the rest of the array, recording alternating 02015 * peak/valley extrema. */ 02016 for (i = i + 1; i < n; i++) { 02017 numaGetFValue(nas, i, &val); 02018 if (direction == 1 && val > maxval ) { /* new local max */ 02019 maxval = val; 02020 loc = i; 02021 } 02022 else if (direction == -1 && val < minval ) { /* new local min */ 02023 minval = val; 02024 loc = i; 02025 } 02026 else if (direction == 1 && (maxval - val >= delta)) { 02027 numaAddNumber(nad, loc); /* save the current max location */ 02028 direction = -1; /* reverse: start looking for a min */ 02029 minval = val; 02030 loc = i; /* current min location */ 02031 } 02032 else if (direction == -1 && (val - minval >= delta)) { 02033 numaAddNumber(nad, loc); /* save the current min location */ 02034 direction = 1; /* reverse: start looking for a max */ 02035 maxval = val; 02036 loc = i; /* current max location */ 02037 } 02038 } 02039 02040 /* Save the final extremum */ 02041 /* numaAddNumber(nad, loc); */ 02042 return nad; 02043 } 02044 02045 02046 /*! 02047 * numaCountReversals() 02048 * 02049 * Input: nas (input values) 02050 * minreversal (relative amount to resolve peaks and valleys) 02051 * &nr (<optional return> number of reversals 02052 * &nrpl (<optional return> reversal density: reversals/length) 02053 * Return: 0 if OK, 1 on error 02054 * 02055 * Notes: 02056 * (1) The input numa is can be generated from pixExtractAlongLine(). 02057 * If so, the x parameters can be used to find the reversal 02058 * frequency along a line. 02059 */ 02060 l_int32 02061 numaCountReversals(NUMA *nas, 02062 l_float32 minreversal, 02063 l_int32 *pnr, 02064 l_float32 *pnrpl) 02065 { 02066 l_int32 n, nr; 02067 l_float32 delx, len; 02068 NUMA *nat; 02069 02070 PROCNAME("numaCountReversals"); 02071 02072 if (!pnr && !pnrpl) 02073 return ERROR_INT("neither &nr nor &nrpl are defined", procName, 1); 02074 if (pnr) *pnr = 0; 02075 if (pnrpl) *pnrpl = 0.0; 02076 if (!nas) 02077 return ERROR_INT("nas not defined", procName, 1); 02078 02079 n = numaGetCount(nas); 02080 nat = numaFindExtrema(nas, minreversal); 02081 nr = numaGetCount(nat); 02082 if (pnr) *pnr = nr; 02083 if (pnrpl) { 02084 numaGetXParameters(nas, NULL, &delx); 02085 len = delx * n; 02086 *pnrpl = (l_float32)nr / len; 02087 } 02088 02089 numaDestroy(&nat); 02090 return 0; 02091 } 02092 02093 02094 /*----------------------------------------------------------------------* 02095 * Threshold crossings and frequency analysis * 02096 *----------------------------------------------------------------------*/ 02097 /*! 02098 * numaSelectCrossingThreshold() 02099 * 02100 * Input: nax (<optional> numa of abscissa values; can be NULL) 02101 * nay (signal) 02102 * estthresh (estimated pixel threshold for crossing: e.g., for 02103 * images, white <--> black; typ. ~120) 02104 * &bestthresh (<return> robust estimate of threshold to use) 02105 * Return: 0 if OK, 1 on error 02106 * 02107 * Note: 02108 * (1) When a valid threshold is used, the number of crossings is 02109 * a maximum, because none are missed. If no threshold intersects 02110 * all the crossings, the crossings must be determined with 02111 * numaCrossingsByPeaks(). 02112 * (2) @estthresh is an input estimate of the threshold that should 02113 * be used. We compute the crossings with 41 thresholds 02114 * (20 below and 20 above). There is a range in which the 02115 * number of crossings is a maximum. Return a threshold 02116 * in the center of this stable plateau of crossings. 02117 * This can then be used with numaCrossingsByThreshold() 02118 * to get a good estimate of crossing locations. 02119 */ 02120 l_int32 02121 numaSelectCrossingThreshold(NUMA *nax, 02122 NUMA *nay, 02123 l_float32 estthresh, 02124 l_float32 *pbestthresh) 02125 { 02126 l_int32 i, inrun, istart, iend, maxstart, maxend, runlen, maxrunlen; 02127 l_int32 val, maxval, nmax, count; 02128 l_float32 thresh, fmaxval, fmodeval; 02129 NUMA *nat, *nac; 02130 02131 PROCNAME("numaSelectCrossingThreshold"); 02132 02133 if (!nay) 02134 return ERROR_INT("nay not defined", procName, 1); 02135 02136 /* Compute the number of crossings for different thresholds */ 02137 nat = numaCreate(41); 02138 for (i = 0; i < 41; i++) { 02139 thresh = estthresh - 80.0 + 4.0 * i; 02140 nac = numaCrossingsByThreshold(nax, nay, thresh); 02141 numaAddNumber(nat, numaGetCount(nac)); 02142 numaDestroy(&nac); 02143 } 02144 02145 /* Find the center of the plateau of max crossings, which 02146 * extends from thresh[istart] to thresh[iend]. */ 02147 numaGetMax(nat, &fmaxval, NULL); 02148 maxval = (l_int32)fmaxval; 02149 nmax = 0; 02150 for (i = 0; i < 41; i++) { 02151 numaGetIValue(nat, i, &val); 02152 if (val == maxval) 02153 nmax++; 02154 } 02155 if (nmax < 3) { /* likely accidental max; try the mode */ 02156 numaGetMode(nat, &fmodeval, &count); 02157 if (count > nmax && fmodeval > 0.5 * fmaxval) 02158 maxval = (l_int32)fmodeval; /* use the mode */ 02159 } 02160 02161 inrun = FALSE; 02162 iend = 40; 02163 maxrunlen = 0; 02164 for (i = 0; i < 41; i++) { 02165 numaGetIValue(nat, i, &val); 02166 if (val == maxval) { 02167 if (!inrun) { 02168 istart = i; 02169 inrun = TRUE; 02170 } 02171 continue; 02172 } 02173 if (inrun && (val != maxval)) { 02174 iend = i - 1; 02175 runlen = iend - istart + 1; 02176 inrun = FALSE; 02177 if (runlen > maxrunlen) { 02178 maxstart = istart; 02179 maxend = iend; 02180 maxrunlen = runlen; 02181 } 02182 } 02183 } 02184 if (inrun) { 02185 runlen = i - istart; 02186 if (runlen > maxrunlen) { 02187 maxstart = istart; 02188 maxend = i - 1; 02189 maxrunlen = runlen; 02190 } 02191 } 02192 02193 #if 0 02194 foundfirst = FALSE; 02195 iend = 40; 02196 for (i = 0; i < 41; i++) { 02197 numaGetIValue(nat, i, &val); 02198 if (val == maxval) { 02199 if (!foundfirst) { 02200 istart = i; 02201 foundfirst = TRUE; 02202 } 02203 } 02204 if ((val != maxval) && foundfirst) { 02205 iend = i - 1; 02206 break; 02207 } 02208 } 02209 nmax = iend - istart + 1; 02210 #endif 02211 02212 *pbestthresh = estthresh - 80.0 + 2.0 * (l_float32)(maxstart + maxend); 02213 02214 #if DEBUG_CROSSINGS 02215 fprintf(stderr, "\nCrossings attain a maximum at %d thresholds, between:\n" 02216 " thresh[%d] = %5.1f and thresh[%d] = %5.1f\n", 02217 nmax, maxstart, estthresh - 80.0 + 4.0 * maxstart, 02218 maxend, estthresh - 80.0 + 4.0 * maxend); 02219 fprintf(stderr, "The best choice: %5.1f\n", *pbestthresh); 02220 fprintf(stderr, "Number of crossings at the 41 thresholds:"); 02221 numaWriteStream(stderr, nat); 02222 #endif /* DEBUG_CROSSINGS */ 02223 02224 numaDestroy(&nat); 02225 return 0; 02226 } 02227 02228 02229 /*! 02230 * numaCrossingsByThreshold() 02231 * 02232 * Input: nax (<optional> numa of abscissa values; can be NULL) 02233 * nay (numa of ordinate values, corresponding to nax) 02234 * thresh (threshold value for nay) 02235 * Return: nad (abscissa pts at threshold), or null on error 02236 * 02237 * Notes: 02238 * (1) If nax == NULL, we use startx and delx from nay to compute 02239 * the crossing values in nad. 02240 */ 02241 NUMA * 02242 numaCrossingsByThreshold(NUMA *nax, 02243 NUMA *nay, 02244 l_float32 thresh) 02245 { 02246 l_int32 i, n; 02247 l_float32 startx, delx; 02248 l_float32 xval1, xval2, yval1, yval2, delta1, delta2, crossval, fract; 02249 NUMA *nad; 02250 02251 PROCNAME("numaCrossingsByThreshold"); 02252 02253 if (!nay) 02254 return (NUMA *)ERROR_PTR("nay not defined", procName, NULL); 02255 n = numaGetCount(nay); 02256 02257 if (nax && (numaGetCount(nax) != n)) 02258 return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL); 02259 02260 nad = numaCreate(0); 02261 numaGetFValue(nay, 0, &yval1); 02262 numaGetXParameters(nay, &startx, &delx); 02263 if (nax) 02264 numaGetFValue(nax, 0, &xval1); 02265 else 02266 xval1 = startx; 02267 for (i = 1; i < n; i++) { 02268 numaGetFValue(nay, i, &yval2); 02269 if (nax) 02270 numaGetFValue(nax, i, &xval2); 02271 else 02272 xval2 = startx + i * delx; 02273 delta1 = yval1 - thresh; 02274 delta2 = yval2 - thresh; 02275 if (delta1 == 0.0) 02276 numaAddNumber(nad, xval1); 02277 else if (delta2 == 0.0) 02278 numaAddNumber(nad, xval2); 02279 else if (delta1 * delta2 < 0.0) { /* crossing */ 02280 fract = L_ABS(delta1) / L_ABS(yval1 - yval2); 02281 crossval = xval1 + fract * (xval2 - xval1); 02282 numaAddNumber(nad, crossval); 02283 } 02284 xval1 = xval2; 02285 yval1 = yval2; 02286 } 02287 02288 return nad; 02289 } 02290 02291 02292 /*! 02293 * numaCrossingsByPeaks() 02294 * 02295 * Input: nax (<optional> numa of abscissa values) 02296 * nay (numa of ordinate values, corresponding to nax) 02297 * delta (parameter used to identify when a new peak can be found) 02298 * Return: nad (abscissa pts at threshold), or null on error 02299 * 02300 * Notes: 02301 * (1) If nax == NULL, we use startx and delx from nay to compute 02302 * the crossing values in nad. 02303 */ 02304 NUMA * 02305 numaCrossingsByPeaks(NUMA *nax, 02306 NUMA *nay, 02307 l_float32 delta) 02308 { 02309 l_int32 i, j, n, np, previndex, curindex; 02310 l_float32 startx, delx; 02311 l_float32 xval1, xval2, yval1, yval2, delta1, delta2; 02312 l_float32 prevval, curval, thresh, crossval, fract; 02313 NUMA *nap, *nad; 02314 02315 PROCNAME("numaCrossingsByPeaks"); 02316 02317 if (!nax) 02318 return (NUMA *)ERROR_PTR("nax not defined", procName, NULL); 02319 if (!nay) 02320 return (NUMA *)ERROR_PTR("nay not defined", procName, NULL); 02321 02322 n = numaGetCount(nax); 02323 if (numaGetCount(nay) != n) 02324 return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL); 02325 02326 /* Find the extrema. Also add last point in nay to get 02327 * the last transition (from the last peak to the end). 02328 * The number of crossings is 1 more than the number of extrema. */ 02329 nap = numaFindExtrema(nay, delta); 02330 numaAddNumber(nap, n - 1); 02331 np = numaGetCount(nap); 02332 L_INFO_INT("Number of crossings: %d", procName, np); 02333 02334 /* Do all computation in index units of nax */ 02335 nad = numaCreate(np); /* output crossings, in nax units */ 02336 previndex = 0; /* prime the search with 1st point */ 02337 numaGetFValue(nay, 0, &prevval); /* prime the search with 1st point */ 02338 numaGetXParameters(nay, &startx, &delx); 02339 for (i = 0; i < np; i++) { 02340 numaGetIValue(nap, i, &curindex); 02341 numaGetFValue(nay, curindex, &curval); 02342 thresh = (prevval + curval) / 2.0; 02343 /* fprintf(stderr, "thresh[%d] = %7.3f\n", i, thresh); */ 02344 if (nax) 02345 numaGetFValue(nax, previndex, &xval1); 02346 else 02347 xval1 = startx + previndex * delx; 02348 numaGetFValue(nay, previndex, &yval1); 02349 for (j = previndex + 1; j <= curindex; j++) { 02350 if (nax) 02351 numaGetFValue(nax, j, &xval2); 02352 else 02353 xval2 = startx + j * delx; 02354 numaGetFValue(nay, j, &yval2); 02355 delta1 = yval1 - thresh; 02356 delta2 = yval2 - thresh; 02357 if (delta1 == 0.0) { 02358 numaAddNumber(nad, xval1); 02359 break; 02360 } 02361 else if (delta2 == 0.0) { 02362 numaAddNumber(nad, xval2); 02363 break; 02364 } 02365 else if (delta1 * delta2 < 0.0) { /* crossing */ 02366 fract = L_ABS(delta1) / L_ABS(yval1 - yval2); 02367 crossval = xval1 + fract * (xval2 - xval1); 02368 numaAddNumber(nad, crossval); 02369 break; 02370 } 02371 xval1 = xval2; 02372 yval1 = yval2; 02373 } 02374 previndex = curindex; 02375 prevval = curval; 02376 } 02377 02378 numaDestroy(&nap); 02379 return nad; 02380 } 02381 02382 02383 /*! 02384 * numaEvalBestHaarParameters() 02385 * 02386 * Input: nas (numa of non-negative signal values) 02387 * relweight (relative weight of (-1 comb) / (+1 comb) 02388 * contributions to the 'convolution'. In effect, 02389 * the convolution kernel is a comb consisting of 02390 * alternating +1 and -weight.) 02391 * nwidth (number of widths to consider) 02392 * nshift (number of shifts to consider for each width) 02393 * minwidth (smallest width to consider) 02394 * maxwidth (largest width to consider) 02395 * &bestwidth (<return> width giving largest score) 02396 * &bestshift (<return> shift giving largest score) 02397 * &bestscore (<optional return> convolution with 02398 * "Haar"-like comb) 02399 * Return: 0 if OK, 1 on error 02400 * 02401 * Notes: 02402 * (1) This does a linear sweep of widths, evaluating at @nshift 02403 * shifts for each width, computing the score from a convolution 02404 * with a long comb, and finding the (width, shift) pair that 02405 * gives the maximum score. The best width is the "half-wavelength" 02406 * of the signal. 02407 * (2) The convolving function is a comb of alternating values 02408 * +1 and -1 * relweight, separated by the width and phased by 02409 * the shift. This is similar to a Haar transform, except 02410 * there the convolution is performed with a square wave. 02411 * (3) The function is useful for finding the line spacing 02412 * and strength of line signal from pixel sum projections. 02413 * (4) The score is normalized to the size of nas divided by 02414 * the number of half-widths. For image applications, the input is 02415 * typically an array of pixel projections, so one should 02416 * normalize by dividing the score by the image width in the 02417 * pixel projection direction. 02418 */ 02419 l_int32 02420 numaEvalBestHaarParameters(NUMA *nas, 02421 l_float32 relweight, 02422 l_int32 nwidth, 02423 l_int32 nshift, 02424 l_float32 minwidth, 02425 l_float32 maxwidth, 02426 l_float32 *pbestwidth, 02427 l_float32 *pbestshift, 02428 l_float32 *pbestscore) 02429 { 02430 l_int32 i, j; 02431 l_float32 delwidth, delshift, width, shift, score; 02432 l_float32 bestwidth, bestshift, bestscore; 02433 02434 PROCNAME("numaEvalBestHaarParameters"); 02435 02436 if (!nas) 02437 return ERROR_INT("nas not defined", procName, 1); 02438 if (!pbestwidth || !pbestshift) 02439 return ERROR_INT("&bestwidth and &bestshift not defined", procName, 1); 02440 02441 bestscore = 0.0; 02442 delwidth = (maxwidth - minwidth) / (nwidth - 1.0); 02443 for (i = 0; i < nwidth; i++) { 02444 width = minwidth + delwidth * i; 02445 delshift = width / (l_float32)(nshift); 02446 for (j = 0; j < nshift; j++) { 02447 shift = j * delshift; 02448 numaEvalHaarSum(nas, width, shift, relweight, &score); 02449 if (score > bestscore) { 02450 bestscore = score; 02451 bestwidth = width; 02452 bestshift = shift; 02453 #if DEBUG_FREQUENCY 02454 fprintf(stderr, "width = %7.3f, shift = %7.3f, score = %7.3f\n", 02455 width, shift, score); 02456 #endif /* DEBUG_FREQUENCY */ 02457 } 02458 } 02459 } 02460 02461 *pbestwidth = bestwidth; 02462 *pbestshift = bestshift; 02463 if (pbestscore) 02464 *pbestscore = bestscore; 02465 return 0; 02466 } 02467 02468 02469 /*! 02470 * numaEvalHaarSum() 02471 * 02472 * Input: nas (numa of non-negative signal values) 02473 * width (distance between +1 and -1 in convolution comb) 02474 * shift (phase of the comb: location of first +1) 02475 * relweight (relative weight of (-1 comb) / (+1 comb) 02476 * contributions to the 'convolution'. In effect, 02477 * the convolution kernel is a comb consisting of 02478 * alternating +1 and -weight.) 02479 * &score (<return> convolution with "Haar"-like comb) 02480 * Return: 0 if OK, 1 on error 02481 * 02482 * Notes: 02483 * (1) This does a convolution with a comb of alternating values 02484 * +1 and -relweight, separated by the width and phased by the shift. 02485 * This is similar to a Haar transform, except that for Haar, 02486 * (1) the convolution kernel is symmetric about 0, so the 02487 * relweight is 1.0, and 02488 * (2) the convolution is performed with a square wave. 02489 * (2) The score is normalized to the size of nas divided by 02490 * twice the "width". For image applications, the input is 02491 * typically an array of pixel projections, so one should 02492 * normalize by dividing the score by the image width in the 02493 * pixel projection direction. 02494 * (3) To get a Haar-like result, use relweight = 1.0. For detecting 02495 * signals where you expect every other sample to be close to 02496 * zero, as with barcodes or filtered text lines, you can 02497 * use relweight > 1.0. 02498 */ 02499 l_int32 02500 numaEvalHaarSum(NUMA *nas, 02501 l_float32 width, 02502 l_float32 shift, 02503 l_float32 relweight, 02504 l_float32 *pscore) 02505 { 02506 l_int32 i, n, nsamp, index; 02507 l_float32 score, weight, val; 02508 02509 PROCNAME("numaEvalHaarSum"); 02510 02511 if (!pscore) 02512 return ERROR_INT("&score not defined", procName, 1); 02513 *pscore = 0.0; 02514 if (!nas) 02515 return ERROR_INT("nas not defined", procName, 1); 02516 if ((n = numaGetCount(nas)) < 2 * width) 02517 return ERROR_INT("nas size too small", procName, 1); 02518 02519 score = 0.0; 02520 nsamp = (l_int32)((n - shift) / width); 02521 for (i = 0; i < nsamp; i++) { 02522 index = (l_int32)(shift + i * width); 02523 weight = (i % 2) ? 1.0 : -1.0 * relweight; 02524 numaGetFValue(nas, index, &val); 02525 score += weight * val; 02526 } 02527 02528 *pscore = 2.0 * width * score / (l_float32)n; 02529 return 0; 02530 } 02531