Leptonica 1.68
C Image Processing Library
|
00001 /*====================================================================* 00002 - Copyright (C) 2001 Leptonica. All rights reserved. 00003 - This software is distributed in the hope that it will be 00004 - useful, but with NO WARRANTY OF ANY KIND. 00005 - No author or distributor accepts responsibility to anyone for the 00006 - consequences of using this software, or for whether it serves any 00007 - particular purpose or works at all, unless he or she says so in 00008 - writing. Everyone is granted permission to copy, modify and 00009 - redistribute this source code, for commercial or non-commercial 00010 - purposes, with the following restrictions: (1) the origin of this 00011 - source code must not be misrepresented; (2) modified versions must 00012 - be plainly marked as such; and (3) this notice may not be removed 00013 - or altered from any source or modified source distribution. 00014 *====================================================================*/ 00015 00016 00017 /* 00018 * sarray.c 00019 * 00020 * Create/Destroy/Copy 00021 * SARRAY *sarrayCreate() 00022 * SARRAY *sarrayCreateInitialized() 00023 * SARRAY *sarrayCreateWordsFromString() 00024 * SARRAY *sarrayCreateLinesFromString() 00025 * void *sarrayDestroy() 00026 * SARRAY *sarrayCopy() 00027 * SARRAY *sarrayClone() 00028 * 00029 * Add/Remove string 00030 * l_int32 sarrayAddString() 00031 * l_int32 sarrayExtendArray() 00032 * char *sarrayRemoveString() 00033 * l_int32 sarrayReplaceString() 00034 * l_int32 sarrayClear() 00035 * 00036 * Accessors 00037 * l_int32 sarrayGetCount() 00038 * char **sarrayGetArray() 00039 * char *sarrayGetString() 00040 * l_int32 sarrayGetRefcount() 00041 * l_int32 sarrayChangeRefcount() 00042 * 00043 * Conversion back to string 00044 * char *sarrayToString() 00045 * char *sarrayToStringRange() 00046 * 00047 * Concatenate 2 sarrays 00048 * l_int32 sarrayConcatenate() 00049 * l_int32 sarrayAppendRange() 00050 * 00051 * Pad an sarray to be the same size as another sarray 00052 * l_int32 sarrayPadToSameSize() 00053 * 00054 * Convert word sarray to (formatted) line sarray 00055 * SARRAY *sarrayConvertWordsToLines() 00056 * 00057 * Split string on separator list 00058 * SARRAY *sarraySplitString() 00059 * 00060 * Filter sarray 00061 * SARRAY *sarraySelectBySubstring() 00062 * SARRAY *sarraySelectByRange() 00063 * l_int32 sarrayParseRange() 00064 * 00065 * Sort 00066 * SARRAY *sarraySort() 00067 * l_int32 stringCompareLexical() 00068 * 00069 * Serialize for I/O 00070 * SARRAY *sarrayRead() 00071 * SARRAY *sarrayReadStream() 00072 * l_int32 sarrayWrite() 00073 * l_int32 sarrayWriteStream() 00074 * l_int32 sarrayAppend() 00075 * 00076 * Directory filenames 00077 * SARRAY *getNumberedPathnamesInDirectory() 00078 * SARRAY *getSortedPathnamesInDirectory() 00079 * SARRAY *getFilenamesInDirectory() 00080 * 00081 * Comments on usage: 00082 * 00083 * These functions are important for efficient manipulation 00084 * of string data. They have been used in leptonica for 00085 * generating and parsing text files, and for generating 00086 * code for compilation. The user is responsible for 00087 * correctly disposing of strings that have been extracted 00088 * from sarrays. 00089 * 00090 * - When you want a string from an Sarray to inspect it, or 00091 * plan to make a copy of it later, use sarrayGetString() 00092 * with copyflag = 0. In this case, you must neither free 00093 * the string nor put it directly in another array. 00094 * We provide the copyflag constant L_NOCOPY, which is 0, 00095 * for this purpose: 00096 * str-not-owned = sarrayGetString(sa, index, L_NOCOPY); 00097 * To extract a copy of a string, use: 00098 * str-owned = sarrayGetString(sa, index, L_COPY); 00099 * 00100 * - When you want to insert a string that is in one 00101 * array into another array (always leaving the first 00102 * array intact), you have two options: 00103 * (1) use copyflag = L_COPY to make an immediate copy, 00104 * which you must then add to the second array 00105 * by insertion; namely, 00106 * str-owned = sarrayGetString(sa, index, L_COPY); 00107 * sarrayAddString(sa, str-owned, L_INSERT); 00108 * (2) use copyflag = L_NOCOPY to get another handle to 00109 * the string, in which case you must add 00110 * a copy of it to the second string array: 00111 * str-not-owned = sarrayGetString(sa, index, L_NOCOPY); 00112 * sarrayAddString(sa, str-not-owned, L_COPY). 00113 * 00114 * In all cases, when you use copyflag = L_COPY to extract 00115 * a string from an array, you must either free it 00116 * or insert it in an array that will be freed later. 00117 */ 00118 00119 #include <string.h> 00120 #ifndef _WIN32 00121 #include <dirent.h> /* unix only */ 00122 #endif /* ! _WIN32 */ 00123 #include "allheaders.h" 00124 00125 static const l_int32 INITIAL_PTR_ARRAYSIZE = 50; /* n'importe quoi */ 00126 static const l_int32 L_BUF_SIZE = 512; 00127 00128 00129 /*--------------------------------------------------------------------------* 00130 * String array create/destroy/copy/extend * 00131 *--------------------------------------------------------------------------*/ 00132 /*! 00133 * sarrayCreate() 00134 * 00135 * Input: size of string ptr array to be alloc'd 00136 * (use 0 for default) 00137 * Return: sarray, or null on error 00138 */ 00139 SARRAY * 00140 sarrayCreate(l_int32 n) 00141 { 00142 SARRAY *sa; 00143 00144 PROCNAME("sarrayCreate"); 00145 00146 if (n <= 0) 00147 n = INITIAL_PTR_ARRAYSIZE; 00148 00149 if ((sa = (SARRAY *)CALLOC(1, sizeof(SARRAY))) == NULL) 00150 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL); 00151 if ((sa->array = (char **)CALLOC(n, sizeof(char *))) == NULL) 00152 return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL); 00153 00154 sa->nalloc = n; 00155 sa->n = 0; 00156 sa->refcount = 1; 00157 return sa; 00158 } 00159 00160 00161 /*! 00162 * sarrayCreateInitialized() 00163 * 00164 * Input: n (size of string ptr array to be alloc'd) 00165 * initstr (string to be initialized on the full array) 00166 * Return: sarray, or null on error 00167 */ 00168 SARRAY * 00169 sarrayCreateInitialized(l_int32 n, 00170 char *initstr) 00171 { 00172 l_int32 i; 00173 SARRAY *sa; 00174 00175 PROCNAME("sarrayCreateInitialized"); 00176 00177 if (n <= 0) 00178 return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL); 00179 if (!initstr) 00180 return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL); 00181 00182 sa = sarrayCreate(n); 00183 for (i = 0; i < n; i++) 00184 sarrayAddString(sa, initstr, L_COPY); 00185 return sa; 00186 } 00187 00188 00189 /*! 00190 * sarrayCreateWordsFromString() 00191 * 00192 * Input: string 00193 * Return: sarray, or null on error 00194 * 00195 * Notes: 00196 * (1) This finds the number of word substrings, creates an sarray 00197 * of this size, and puts copies of each substring into the sarray. 00198 */ 00199 SARRAY * 00200 sarrayCreateWordsFromString(const char *string) 00201 { 00202 char separators[] = " \n\t"; 00203 l_int32 i, nsub, size, inword; 00204 SARRAY *sa; 00205 00206 PROCNAME("sarrayCreateWordsFromString"); 00207 00208 if (!string) 00209 return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL); 00210 00211 /* Find the number of words */ 00212 size = strlen(string); 00213 nsub = 0; 00214 inword = FALSE; 00215 for (i = 0; i < size; i++) { 00216 if (inword == FALSE && 00217 (string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) { 00218 inword = TRUE; 00219 nsub++; 00220 } 00221 else if (inword == TRUE && 00222 (string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) { 00223 inword = FALSE; 00224 } 00225 } 00226 00227 if ((sa = sarrayCreate(nsub)) == NULL) 00228 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL); 00229 sarraySplitString(sa, string, separators); 00230 00231 return sa; 00232 } 00233 00234 00235 /*! 00236 * sarrayCreateLinesFromString() 00237 * 00238 * Input: string 00239 * blankflag (0 to exclude blank lines; 1 to include) 00240 * Return: sarray, or null on error 00241 * 00242 * Notes: 00243 * (1) This finds the number of line substrings, each of which 00244 * ends with a newline, and puts a copy of each substring 00245 * in a new sarray. 00246 * (2) The newline characters are removed from each substring. 00247 */ 00248 SARRAY * 00249 sarrayCreateLinesFromString(char *string, 00250 l_int32 blankflag) 00251 { 00252 l_int32 i, nsub, size, startptr; 00253 char *cstring, *substring; 00254 SARRAY *sa; 00255 00256 PROCNAME("sarrayCreateLinesFromString"); 00257 00258 if (!string) 00259 return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL); 00260 00261 /* find the number of lines */ 00262 size = strlen(string); 00263 nsub = 0; 00264 for (i = 0; i < size; i++) { 00265 if (string[i] == '\n') 00266 nsub++; 00267 } 00268 00269 if ((sa = sarrayCreate(nsub)) == NULL) 00270 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL); 00271 00272 if (blankflag) { /* keep blank lines as null strings */ 00273 /* Make a copy for munging */ 00274 if ((cstring = stringNew(string)) == NULL) 00275 return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL); 00276 /* We'll insert nulls like strtok */ 00277 startptr = 0; 00278 for (i = 0; i < size; i++) { 00279 if (cstring[i] == '\n') { 00280 cstring[i] = '\0'; 00281 if (i > 0 && cstring[i - 1] == '\r') 00282 cstring[i - 1] = '\0'; /* also remove Windows CR */ 00283 if ((substring = stringNew(cstring + startptr)) == NULL) 00284 return (SARRAY *)ERROR_PTR("substring not made", 00285 procName, NULL); 00286 sarrayAddString(sa, substring, L_INSERT); 00287 /* fprintf(stderr, "substring = %s\n", substring); */ 00288 startptr = i + 1; 00289 } 00290 } 00291 if (startptr < size) { /* no newline at end of last line */ 00292 if ((substring = stringNew(cstring + startptr)) == NULL) 00293 return (SARRAY *)ERROR_PTR("substring not made", 00294 procName, NULL); 00295 sarrayAddString(sa, substring, L_INSERT); 00296 /* fprintf(stderr, "substring = %s\n", substring); */ 00297 } 00298 FREE(cstring); 00299 } 00300 else { /* remove blank lines; use strtok */ 00301 sarraySplitString(sa, string, "\r\n"); 00302 } 00303 00304 return sa; 00305 } 00306 00307 00308 /*! 00309 * sarrayDestroy() 00310 * 00311 * Input: &sarray <to be nulled> 00312 * Return: void 00313 * 00314 * Notes: 00315 * (1) Decrements the ref count and, if 0, destroys the sarray. 00316 * (2) Always nulls the input ptr. 00317 */ 00318 void 00319 sarrayDestroy(SARRAY **psa) 00320 { 00321 l_int32 i; 00322 SARRAY *sa; 00323 00324 PROCNAME("sarrayDestroy"); 00325 00326 if (psa == NULL) { 00327 L_WARNING("ptr address is NULL!", procName); 00328 return; 00329 } 00330 if ((sa = *psa) == NULL) 00331 return; 00332 00333 sarrayChangeRefcount(sa, -1); 00334 if (sarrayGetRefcount(sa) <= 0) { 00335 if (sa->array) { 00336 for (i = 0; i < sa->n; i++) { 00337 if (sa->array[i]) 00338 FREE(sa->array[i]); 00339 } 00340 FREE(sa->array); 00341 } 00342 FREE(sa); 00343 } 00344 00345 *psa = NULL; 00346 return; 00347 } 00348 00349 00350 /*! 00351 * sarrayCopy() 00352 * 00353 * Input: sarray 00354 * Return: copy of sarray, or null on error 00355 */ 00356 SARRAY * 00357 sarrayCopy(SARRAY *sa) 00358 { 00359 l_int32 i; 00360 SARRAY *csa; 00361 00362 PROCNAME("sarrayCopy"); 00363 00364 if (!sa) 00365 return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL); 00366 00367 if ((csa = sarrayCreate(sa->nalloc)) == NULL) 00368 return (SARRAY *)ERROR_PTR("csa not made", procName, NULL); 00369 00370 for (i = 0; i < sa->n; i++) 00371 sarrayAddString(csa, sa->array[i], L_COPY); 00372 00373 return csa; 00374 } 00375 00376 00377 /*! 00378 * sarrayClone() 00379 * 00380 * Input: sarray 00381 * Return: ptr to same sarray, or null on error 00382 */ 00383 SARRAY * 00384 sarrayClone(SARRAY *sa) 00385 { 00386 PROCNAME("sarrayClone"); 00387 00388 if (!sa) 00389 return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL); 00390 sarrayChangeRefcount(sa, 1); 00391 return sa; 00392 } 00393 00394 00395 /*! 00396 * sarrayAddString() 00397 * 00398 * Input: sarray 00399 * string (string to be added) 00400 * copyflag (L_INSERT, L_COPY) 00401 * Return: 0 if OK, 1 on error 00402 * 00403 * Notes: 00404 * (1) Legacy usage decrees that we always use 0 to insert a string 00405 * directly and 1 to insert a copy of the string. The 00406 * enums for L_INSERT and L_COPY agree with this convention, 00407 * and will not change in the future. 00408 * (2) See usage comments at the top of this file. 00409 */ 00410 l_int32 00411 sarrayAddString(SARRAY *sa, 00412 char *string, 00413 l_int32 copyflag) 00414 { 00415 l_int32 n; 00416 00417 PROCNAME("sarrayAddString"); 00418 00419 if (!sa) 00420 return ERROR_INT("sa not defined", procName, 1); 00421 if (!string) 00422 return ERROR_INT("string not defined", procName, 1); 00423 if (copyflag != L_INSERT && copyflag != L_COPY) 00424 return ERROR_INT("invalid copyflag", procName, 1); 00425 00426 n = sarrayGetCount(sa); 00427 if (n >= sa->nalloc) 00428 sarrayExtendArray(sa); 00429 00430 if (copyflag == L_INSERT) 00431 sa->array[n] = string; 00432 else /* L_COPY */ 00433 sa->array[n] = stringNew(string); 00434 sa->n++; 00435 00436 return 0; 00437 } 00438 00439 00440 /*! 00441 * sarrayExtendArray() 00442 * 00443 * Input: sarray 00444 * Return: 0 if OK, 1 on error 00445 */ 00446 l_int32 00447 sarrayExtendArray(SARRAY *sa) 00448 { 00449 PROCNAME("sarrayExtendArray"); 00450 00451 if (!sa) 00452 return ERROR_INT("sa not defined", procName, 1); 00453 00454 if ((sa->array = (char **)reallocNew((void **)&sa->array, 00455 sizeof(char *) * sa->nalloc, 00456 2 * sizeof(char *) * sa->nalloc)) == NULL) 00457 return ERROR_INT("new ptr array not returned", procName, 1); 00458 00459 sa->nalloc *= 2; 00460 return 0; 00461 } 00462 00463 00464 /*! 00465 * sarrayRemoveString() 00466 * 00467 * Input: sarray 00468 * index (of string within sarray) 00469 * Return: removed string, or null on error 00470 */ 00471 char * 00472 sarrayRemoveString(SARRAY *sa, 00473 l_int32 index) 00474 { 00475 char *string; 00476 char **array; 00477 l_int32 i, n, nalloc; 00478 00479 PROCNAME("sarrayRemoveString"); 00480 00481 if (!sa) 00482 return (char *)ERROR_PTR("sa not defined", procName, NULL); 00483 00484 if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL) 00485 return (char *)ERROR_PTR("array not returned", procName, NULL); 00486 00487 if (index < 0 || index >= n) 00488 return (char *)ERROR_PTR("array index out of bounds", procName, NULL); 00489 00490 string = array[index]; 00491 00492 /* If removed string is not at end of array, shift 00493 * to fill in, maintaining original ordering. 00494 * Note: if we didn't care about the order, we could 00495 * put the last string array[n - 1] directly into the hole. */ 00496 for (i = index; i < n - 1; i++) 00497 array[i] = array[i + 1]; 00498 00499 sa->n--; 00500 return string; 00501 } 00502 00503 00504 /*! 00505 * sarrayReplaceString() 00506 * 00507 * Input: sarray 00508 * index (of string within sarray to be replaced) 00509 * newstr (string to replace existing one) 00510 * copyflag (L_INSERT, L_COPY) 00511 * Return: 0 if OK, 1 on error 00512 * 00513 * Notes: 00514 * (1) This destroys an existing string and replaces it with 00515 * the new string or a copy of it. 00516 * (2) By design, an sarray is always compacted, so there are 00517 * never any holes (null ptrs) in the ptr array up to the 00518 * current count. 00519 */ 00520 l_int32 00521 sarrayReplaceString(SARRAY *sa, 00522 l_int32 index, 00523 char *newstr, 00524 l_int32 copyflag) 00525 { 00526 char *str; 00527 l_int32 n; 00528 00529 PROCNAME("sarrayReplaceString"); 00530 00531 if (!sa) 00532 return ERROR_INT("sa not defined", procName, 1); 00533 n = sarrayGetCount(sa); 00534 if (index < 0 || index >= n) 00535 return ERROR_INT("array index out of bounds", procName, 1); 00536 if (!newstr) 00537 return ERROR_INT("newstr not defined", procName, 1); 00538 if (copyflag != L_INSERT && copyflag != L_COPY) 00539 return ERROR_INT("invalid copyflag", procName, 1); 00540 00541 FREE(sa->array[index]); 00542 if (copyflag == L_INSERT) 00543 str = newstr; 00544 else /* L_COPY */ 00545 str = stringNew(newstr); 00546 sa->array[index] = str; 00547 return 0; 00548 } 00549 00550 00551 /*! 00552 * sarrayClear() 00553 * 00554 * Input: sarray 00555 * Return: 0 if OK; 1 on error 00556 */ 00557 l_int32 00558 sarrayClear(SARRAY *sa) 00559 { 00560 l_int32 i; 00561 00562 PROCNAME("sarrayClear"); 00563 00564 if (!sa) 00565 return ERROR_INT("sa not defined", procName, 1); 00566 for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */ 00567 FREE(sa->array[i]); 00568 sa->array[i] = NULL; 00569 } 00570 sa->n = 0; 00571 return 0; 00572 } 00573 00574 00575 /*----------------------------------------------------------------------* 00576 * Accessors * 00577 *----------------------------------------------------------------------*/ 00578 /*! 00579 * sarrayGetCount() 00580 * 00581 * Input: sarray 00582 * Return: count, or 0 if no strings or on error 00583 */ 00584 l_int32 00585 sarrayGetCount(SARRAY *sa) 00586 { 00587 PROCNAME("sarrayGetCount"); 00588 00589 if (!sa) 00590 return ERROR_INT("sa not defined", procName, 0); 00591 return sa->n; 00592 } 00593 00594 00595 /*! 00596 * sarrayGetArray() 00597 * 00598 * Input: sarray 00599 * &nalloc (<optional return> number allocated string ptrs) 00600 * &n (<optional return> number allocated strings) 00601 * Return: ptr to string array, or null on error 00602 * 00603 * Notes: 00604 * (1) Caution: the returned array is not a copy, so caller 00605 * must not destroy it! 00606 */ 00607 char ** 00608 sarrayGetArray(SARRAY *sa, 00609 l_int32 *pnalloc, 00610 l_int32 *pn) 00611 { 00612 char **array; 00613 00614 PROCNAME("sarrayGetArray"); 00615 00616 if (!sa) 00617 return (char **)ERROR_PTR("sa not defined", procName, NULL); 00618 00619 array = sa->array; 00620 if (pnalloc) *pnalloc = sa->nalloc; 00621 if (pn) *pn = sa->n; 00622 00623 return array; 00624 } 00625 00626 00627 /*! 00628 * sarrayGetString() 00629 * 00630 * Input: sarray 00631 * index (to the index-th string) 00632 * copyflag (L_NOCOPY or L_COPY) 00633 * Return: string, or null on error 00634 * 00635 * Notes: 00636 * (1) Legacy usage decrees that we always use 0 to get the 00637 * pointer to the string itself, and 1 to get a copy of 00638 * the string. 00639 * (2) See usage comments at the top of this file. 00640 * (3) To get a pointer to the string itself, use for copyflag: 00641 * L_NOCOPY or 0 or FALSE 00642 * To get a copy of the string, use for copyflag: 00643 * L_COPY or 1 or TRUE 00644 * The const values of L_NOCOPY and L_COPY are guaranteed not 00645 * to change. 00646 */ 00647 char * 00648 sarrayGetString(SARRAY *sa, 00649 l_int32 index, 00650 l_int32 copyflag) 00651 { 00652 PROCNAME("sarrayGetString"); 00653 00654 if (!sa) 00655 return (char *)ERROR_PTR("sa not defined", procName, NULL); 00656 if (index < 0 || index >= sa->n) 00657 return (char *)ERROR_PTR("index not valid", procName, NULL); 00658 if (copyflag != L_NOCOPY && copyflag != L_COPY) 00659 return (char *)ERROR_PTR("invalid copyflag", procName, NULL); 00660 00661 if (copyflag == L_NOCOPY) 00662 return sa->array[index]; 00663 else /* L_COPY */ 00664 return stringNew(sa->array[index]); 00665 } 00666 00667 00668 /*! 00669 * sarrayGetRefCount() 00670 * 00671 * Input: sarray 00672 * Return: refcount, or UNDEF on error 00673 */ 00674 l_int32 00675 sarrayGetRefcount(SARRAY *sa) 00676 { 00677 PROCNAME("sarrayGetRefcount"); 00678 00679 if (!sa) 00680 return ERROR_INT("sa not defined", procName, UNDEF); 00681 return sa->refcount; 00682 } 00683 00684 00685 /*! 00686 * sarrayChangeRefCount() 00687 * 00688 * Input: sarray 00689 * delta (change to be applied) 00690 * Return: 0 if OK, 1 on error 00691 */ 00692 l_int32 00693 sarrayChangeRefcount(SARRAY *sa, 00694 l_int32 delta) 00695 { 00696 PROCNAME("sarrayChangeRefcount"); 00697 00698 if (!sa) 00699 return ERROR_INT("sa not defined", procName, UNDEF); 00700 sa->refcount += delta; 00701 return 0; 00702 } 00703 00704 00705 /*----------------------------------------------------------------------* 00706 * Conversion to string * 00707 *----------------------------------------------------------------------*/ 00708 /*! 00709 * sarrayToString() 00710 * 00711 * Input: sarray 00712 * addnlflag (flag: 0 adds nothing to each substring 00713 * 1 adds '\n' to each substring 00714 * 2 adds ' ' to each substring) 00715 * Return: dest string, or null on error 00716 * 00717 * Notes: 00718 * (1) Concatenates all the strings in the sarray, preserving 00719 * all white space. 00720 * (2) If addnlflag != 0, adds either a '\n' or a ' ' after 00721 * each substring. 00722 * (3) This function was NOT implemented as: 00723 * for (i = 0; i < n; i++) 00724 * strcat(dest, sarrayGetString(sa, i, L_NOCOPY)); 00725 * Do you see why? 00726 */ 00727 char * 00728 sarrayToString(SARRAY *sa, 00729 l_int32 addnlflag) 00730 { 00731 PROCNAME("sarrayToString"); 00732 00733 if (!sa) 00734 return (char *)ERROR_PTR("sa not defined", procName, NULL); 00735 00736 return sarrayToStringRange(sa, 0, 0, addnlflag); 00737 } 00738 00739 00740 /*! 00741 * sarrayToStringRange() 00742 * 00743 * Input: sarray 00744 * first (index of first string to use; starts with 0) 00745 * nstrings (number of strings to append into the result; use 00746 * 0 to append to the end of the sarray) 00747 * addnlflag (flag: 0 adds nothing to each substring 00748 * 1 adds '\n' to each substring 00749 * 2 adds ' ' to each substring) 00750 * Return: dest string, or null on error 00751 * 00752 * Notes: 00753 * (1) Concatenates the specified strings inthe sarray, preserving 00754 * all white space. 00755 * (2) If addnlflag != 0, adds either a '\n' or a ' ' after 00756 * each substring. 00757 * (3) If the sarray is empty, this returns a string with just 00758 * the character corresponding to @addnlflag. 00759 */ 00760 char * 00761 sarrayToStringRange(SARRAY *sa, 00762 l_int32 first, 00763 l_int32 nstrings, 00764 l_int32 addnlflag) 00765 { 00766 char *dest, *src, *str; 00767 l_int32 n, i, last, size, index, len; 00768 00769 PROCNAME("sarrayToStringRange"); 00770 00771 if (!sa) 00772 return (char *)ERROR_PTR("sa not defined", procName, NULL); 00773 if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2) 00774 return (char *)ERROR_PTR("invalid addnlflag", procName, NULL); 00775 00776 n = sarrayGetCount(sa); 00777 00778 /* Empty sa; return char corresponding to addnlflag only */ 00779 if (n == 0) { 00780 if (first == 0) { 00781 if (addnlflag == 0) 00782 return stringNew(""); 00783 if (addnlflag == 1) 00784 return stringNew("\n"); 00785 else /* addnlflag == 2) */ 00786 return stringNew(" "); 00787 } 00788 else 00789 return (char *)ERROR_PTR("first not valid", procName, NULL); 00790 } 00791 00792 if (first < 0 || first >= n) 00793 return (char *)ERROR_PTR("first not valid", procName, NULL); 00794 if (nstrings == 0 || (nstrings > n - first)) 00795 nstrings = n - first; /* no overflow */ 00796 last = first + nstrings - 1; 00797 00798 size = 0; 00799 for (i = first; i <= last; i++) { 00800 if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL) 00801 return (char *)ERROR_PTR("str not found", procName, NULL); 00802 size += strlen(str) + 2; 00803 } 00804 00805 if ((dest = (char *)CALLOC(size + 1, sizeof(char))) == NULL) 00806 return (char *)ERROR_PTR("dest not made", procName, NULL); 00807 00808 index = 0; 00809 for (i = first; i <= last; i++) { 00810 src = sarrayGetString(sa, i, L_NOCOPY); 00811 len = strlen(src); 00812 memcpy(dest + index, src, len); 00813 index += len; 00814 if (addnlflag == 1) { 00815 dest[index] = '\n'; 00816 index++; 00817 } 00818 else if (addnlflag == 2) { 00819 dest[index] = ' '; 00820 index++; 00821 } 00822 } 00823 00824 return dest; 00825 } 00826 00827 00828 /*----------------------------------------------------------------------* 00829 * Concatenate 2 sarrays * 00830 *----------------------------------------------------------------------*/ 00831 /*! 00832 * sarrayConcatenate() 00833 * 00834 * Input: sa1 (to be added to) 00835 * sa2 (append to sa1) 00836 * Return: 0 if OK, 1 on error 00837 * 00838 * Notes: 00839 * (1) Copies of the strings in sarray2 are added to sarray1. 00840 */ 00841 l_int32 00842 sarrayConcatenate(SARRAY *sa1, 00843 SARRAY *sa2) 00844 { 00845 char *str; 00846 l_int32 n, i; 00847 00848 PROCNAME("sarrayConcatenate"); 00849 00850 if (!sa1) 00851 return ERROR_INT("sa1 not defined", procName, 1); 00852 if (!sa2) 00853 return ERROR_INT("sa2 not defined", procName, 1); 00854 00855 n = sarrayGetCount(sa2); 00856 for (i = 0; i < n; i++) { 00857 str = sarrayGetString(sa2, i, L_NOCOPY); 00858 sarrayAddString(sa1, str, L_COPY); 00859 } 00860 00861 return 0; 00862 } 00863 00864 00865 /*! 00866 * sarrayAppendRange() 00867 * 00868 * Input: sa1 (to be added to) 00869 * sa2 (append specified range of strings in sa2 to sa1) 00870 * start (index of first string of sa2 to append) 00871 * end (index of last string of sa2 to append) 00872 * Return: 0 if OK, 1 on error 00873 * 00874 * Notes: 00875 * (1) Copies of the strings in sarray2 are added to sarray1. 00876 * (2) The [start ... end] range is truncated if necessary. 00877 */ 00878 l_int32 00879 sarrayAppendRange(SARRAY *sa1, 00880 SARRAY *sa2, 00881 l_int32 start, 00882 l_int32 end) 00883 { 00884 char *str; 00885 l_int32 n, i; 00886 00887 PROCNAME("sarrayAppendRange"); 00888 00889 if (!sa1) 00890 return ERROR_INT("sa1 not defined", procName, 1); 00891 if (!sa2) 00892 return ERROR_INT("sa2 not defined", procName, 1); 00893 if (start < 0) 00894 start = 0; 00895 n = sarrayGetCount(sa2); 00896 if (end >= n) 00897 end = n - 1; 00898 if (start > end) 00899 return ERROR_INT("start > end", procName, 1); 00900 00901 for (i = start; i <= end; i++) { 00902 str = sarrayGetString(sa2, i, L_NOCOPY); 00903 sarrayAddString(sa1, str, L_COPY); 00904 } 00905 00906 return 0; 00907 } 00908 00909 00910 /*----------------------------------------------------------------------* 00911 * Pad an sarray to be the same size as another sarray * 00912 *----------------------------------------------------------------------*/ 00913 /*! 00914 * sarrayPadToSameSize() 00915 * 00916 * Input: sa1, sa2 00917 * padstring 00918 * Return: 0 if OK, 1 on error 00919 * 00920 * Notes: 00921 * (1) If two sarrays have different size, this adds enough 00922 * instances of @padstring to the smaller so that they are 00923 * the same size. It is useful when two or more sarrays 00924 * are being sequenced in parallel, and it is necessary to 00925 * find a valid string at each index. 00926 */ 00927 l_int32 00928 sarrayPadToSameSize(SARRAY *sa1, 00929 SARRAY *sa2, 00930 char *padstring) 00931 { 00932 l_int32 i, n1, n2; 00933 00934 PROCNAME("sarrayPadToSameSize"); 00935 00936 if (!sa1 || !sa2) 00937 return ERROR_INT("both sa1 and sa2 not defined", procName, 1); 00938 00939 n1 = sarrayGetCount(sa1); 00940 n2 = sarrayGetCount(sa2); 00941 if (n1 < n2) { 00942 for (i = n1; i < n2; i++) 00943 sarrayAddString(sa1, padstring, L_COPY); 00944 } 00945 else if (n1 > n2) { 00946 for (i = n2; i < n1; i++) 00947 sarrayAddString(sa2, padstring, L_COPY); 00948 } 00949 00950 return 0; 00951 } 00952 00953 00954 /*----------------------------------------------------------------------* 00955 * Convert word sarray to line sarray * 00956 *----------------------------------------------------------------------*/ 00957 /*! 00958 * sarrayConvertWordsToLines() 00959 * 00960 * Input: sa (sa of individual words) 00961 * linesize (max num of chars in each line) 00962 * Return: saout (sa of formatted lines), or null on error 00963 * 00964 * This is useful for re-typesetting text to a specific maximum 00965 * line length. The individual words in the input sarray 00966 * are concatenated into textlines. An input word string of zero 00967 * length is taken to be a paragraph separator. Each time 00968 * such a string is found, the current line is ended and 00969 * a new line is also produced that contains just the 00970 * string of zero length (""). When the output sarray 00971 * of lines is eventually converted to a string with newlines 00972 * (typically) appended to each line string, the empty 00973 * strings are just converted to newlines, producing the visible 00974 * paragraph separation. 00975 * 00976 * What happens when a word is larger than linesize? 00977 * We write it out as a single line anyway! Words preceding 00978 * or following this long word are placed on lines preceding 00979 * or following the line with the long word. Why this choice? 00980 * Long "words" found in text documents are typically URLs, and 00981 * it's often desirable not to put newlines in the middle of a URL. 00982 * The text display program (e.g., text editor) will typically 00983 * wrap the long "word" to fit in the window. 00984 */ 00985 SARRAY * 00986 sarrayConvertWordsToLines(SARRAY *sa, 00987 l_int32 linesize) 00988 { 00989 char *wd, *strl; 00990 char emptystring[] = ""; 00991 l_int32 n, i, len, totlen; 00992 SARRAY *sal, *saout; 00993 00994 PROCNAME("sarrayConvertWordsToLines"); 00995 00996 if (!sa) 00997 return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL); 00998 00999 if ((saout = sarrayCreate(0)) == NULL) 01000 return (SARRAY *)ERROR_PTR("saout not defined", procName, NULL); 01001 01002 n = sarrayGetCount(sa); 01003 totlen = 0; 01004 sal = NULL; 01005 for (i = 0; i < n; i++) { 01006 if (!sal) { 01007 if ((sal = sarrayCreate(0)) == NULL) 01008 return (SARRAY *)ERROR_PTR("sal not made", procName, NULL); 01009 } 01010 wd = sarrayGetString(sa, i, L_NOCOPY); 01011 len = strlen(wd); 01012 if (len == 0) { /* end of paragraph: end line & insert blank line */ 01013 if (totlen > 0) { 01014 strl = sarrayToString(sal, 2); 01015 sarrayAddString(saout, strl, L_INSERT); 01016 } 01017 sarrayAddString(saout, emptystring, L_COPY); 01018 sarrayDestroy(&sal); 01019 totlen = 0; 01020 } 01021 else if (totlen == 0 && len + 1 > linesize) { /* long word! */ 01022 sarrayAddString(saout, wd, L_COPY); /* copy to one line */ 01023 } 01024 else if (totlen + len + 1 > linesize) { /* end line & start new one */ 01025 strl = sarrayToString(sal, 2); 01026 sarrayAddString(saout, strl, L_INSERT); 01027 sarrayDestroy(&sal); 01028 if ((sal = sarrayCreate(0)) == NULL) 01029 return (SARRAY *)ERROR_PTR("sal not made", procName, NULL); 01030 sarrayAddString(sal, wd, L_COPY); 01031 totlen = len + 1; 01032 } 01033 else { /* add to current line */ 01034 sarrayAddString(sal, wd, L_COPY); 01035 totlen += len + 1; 01036 } 01037 } 01038 if (totlen > 0) { /* didn't end with blank line; output last line */ 01039 strl = sarrayToString(sal, 2); 01040 sarrayAddString(saout, strl, L_INSERT); 01041 sarrayDestroy(&sal); 01042 } 01043 01044 return saout; 01045 01046 } 01047 01048 01049 /*----------------------------------------------------------------------* 01050 * Split string on separator list * 01051 *----------------------------------------------------------------------*/ 01052 /* 01053 * sarraySplitString() 01054 * 01055 * Input: sa (to append to; typically empty initially) 01056 * str (string to split; not changed) 01057 * separators (characters that split input string) 01058 * Return: 0 if OK, 1 on error. 01059 * 01060 * Notes: 01061 * (1) This uses strtokSafe(). See the notes there in utils.c. 01062 */ 01063 l_int32 01064 sarraySplitString(SARRAY *sa, 01065 const char *str, 01066 const char *separators) 01067 { 01068 char *cstr, *substr, *saveptr; 01069 01070 PROCNAME("sarraySplitString"); 01071 01072 if (!sa) 01073 return ERROR_INT("sa not defined", procName, 1); 01074 if (!str) 01075 return ERROR_INT("str not defined", procName, 1); 01076 if (!separators) 01077 return ERROR_INT("separators not defined", procName, 1); 01078 01079 cstr = stringNew(str); /* preserves const-ness of input str */ 01080 substr = strtokSafe(cstr, separators, &saveptr); 01081 if (substr) 01082 sarrayAddString(sa, substr, L_INSERT); 01083 while ((substr = strtokSafe(NULL, separators, &saveptr))) 01084 sarrayAddString(sa, substr, L_INSERT); 01085 FREE(cstr); 01086 01087 return 0; 01088 } 01089 01090 01091 /*----------------------------------------------------------------------* 01092 * Filter sarray * 01093 *----------------------------------------------------------------------*/ 01094 /*! 01095 * sarraySelectBySubstring() 01096 * 01097 * Input: sain (input sarray) 01098 * substr (<optional> substring for matching; can be NULL) 01099 * Return: saout (output sarray, filtered with substring) or null on error 01100 * 01101 * Notes: 01102 * (1) This selects all strings in sain that have substr as a substring. 01103 * Note that we can't use strncmp() because we're looking for 01104 * a match to the substring anywhere within each filename. 01105 * (2) If substr == NULL, returns a copy of the sarray. 01106 */ 01107 SARRAY * 01108 sarraySelectBySubstring(SARRAY *sain, 01109 const char *substr) 01110 { 01111 char *str; 01112 l_int32 n, i, offset, found; 01113 SARRAY *saout; 01114 01115 PROCNAME("sarraySelectBySubstring"); 01116 01117 if (!sain) 01118 return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL); 01119 01120 n = sarrayGetCount(sain); 01121 if (!substr || n == 0) 01122 return sarrayCopy(sain); 01123 01124 saout = sarrayCreate(n); 01125 for (i = 0; i < n; i++) { 01126 str = sarrayGetString(sain, i, L_NOCOPY); 01127 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, 01128 strlen(substr), &offset, &found); 01129 if (found) 01130 sarrayAddString(saout, str, L_COPY); 01131 } 01132 01133 return saout; 01134 } 01135 01136 01137 /*! 01138 * sarraySelectByRange() 01139 * 01140 * Input: sain (input sarray) 01141 * first (index of first string to be selected) 01142 * last (index of last string to be selected; use 0 to go to the 01143 * end of the sarray) 01144 * Return: saout (output sarray), or null on error 01145 * 01146 * Notes: 01147 * (1) This makes @saout consisting of copies of all strings in @sain 01148 * in the index set [first ... last]. Use @last == 0 to get all 01149 * strings from @first to the last string in the sarray. 01150 */ 01151 SARRAY * 01152 sarraySelectByRange(SARRAY *sain, 01153 l_int32 first, 01154 l_int32 last) 01155 { 01156 char *str; 01157 l_int32 n, i; 01158 SARRAY *saout; 01159 01160 PROCNAME("sarraySelectByRange"); 01161 01162 if (!sain) 01163 return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL); 01164 if (first < 0) first = 0; 01165 n = sarrayGetCount(sain); 01166 if (last <= 0) last = n - 1; 01167 if (last >= n) { 01168 L_WARNING("@last > n - 1; setting to n - 1", procName); 01169 last = n - 1; 01170 } 01171 if (first > last) 01172 return (SARRAY *)ERROR_PTR("first must be >= last", procName, NULL); 01173 01174 saout = sarrayCreate(0); 01175 for (i = first; i <= last; i++) { 01176 str = sarrayGetString(sain, i, L_COPY); 01177 sarrayAddString(saout, str, L_INSERT); 01178 } 01179 01180 return saout; 01181 } 01182 01183 01184 /*! 01185 * sarrayParseRange() 01186 * 01187 * Input: sa (input sarray) 01188 * start (index to start range search) 01189 * &actualstart (<return> index of actual start; may be > 'start') 01190 * &end (<return> index of end) 01191 * &newstart (<return> index of start of next range) 01192 * substr (substring for matching at beginning of string) 01193 * loc (byte offset within the string for the pattern; use 01194 * -1 if the location does not matter); 01195 * Return: 0 if valid range found; 1 otherwise 01196 * 01197 * Notes: 01198 * (1) This finds the range of the next set of strings in SA, 01199 * beginning the search at 'start', that does NOT have 01200 * the substring 'substr' either at the indicated location 01201 * in the string or anywhere in the string. The input 01202 * variable 'loc' is the specified offset within the string; 01203 * use -1 to indicate 'anywhere in the string'. 01204 * (2) Always check the return value to verify that a valid range 01205 * was found. 01206 * (3) If a valid range is not found, the values of actstart, 01207 * end and newstart are all set to the size of sa. 01208 * (4) If this is the last valid range, newstart returns the value n. 01209 * In use, this should be tested before calling the function. 01210 * (5) Usage example. To find all the valid ranges in a file 01211 * where the invalid lines begin with two dashes, copy each 01212 * line in the file to a string in an sarray, and do: 01213 * start = 0; 01214 * while (!sarrayParseRange(sa, start, &actstart, &end, &start, 01215 * "--", 0)) 01216 * fprintf(stderr, "start = %d, end = %d\n", actstart, end); 01217 */ 01218 l_int32 01219 sarrayParseRange(SARRAY *sa, 01220 l_int32 start, 01221 l_int32 *pactualstart, 01222 l_int32 *pend, 01223 l_int32 *pnewstart, 01224 const char *substr, 01225 l_int32 loc) 01226 { 01227 char *str; 01228 l_int32 n, i, offset, found; 01229 01230 PROCNAME("sarrayParseRange"); 01231 01232 if (!sa) 01233 return ERROR_INT("sa not defined", procName, 1); 01234 if (!pactualstart || !pend || !pnewstart) 01235 return ERROR_INT("not all range addresses defined", procName, 1); 01236 n = sarrayGetCount(sa); 01237 *pactualstart = *pend = *pnewstart = n; 01238 if (!substr) 01239 return ERROR_INT("substr not defined", procName, 1); 01240 01241 /* Look for the first string without the marker */ 01242 if (start < 0 || start >= n) 01243 return 1; 01244 for (i = start; i < n; i++) { 01245 str = sarrayGetString(sa, i, L_NOCOPY); 01246 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, 01247 strlen(substr), &offset, &found); 01248 if (loc < 0) { 01249 if (!found) break; 01250 } else { 01251 if (!found || offset != loc) break; 01252 } 01253 } 01254 start = i; 01255 if (i == n) /* couldn't get started */ 01256 return 1; 01257 01258 /* Look for the last string without the marker */ 01259 *pactualstart = start; 01260 for (i = start + 1; i < n; i++) { 01261 str = sarrayGetString(sa, i, L_NOCOPY); 01262 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, 01263 strlen(substr), &offset, &found); 01264 if (loc < 0) { 01265 if (found) break; 01266 } else { 01267 if (found && offset == loc) break; 01268 } 01269 } 01270 *pend = i - 1; 01271 start = i; 01272 if (i == n) /* no further range */ 01273 return 0; 01274 01275 /* Look for the first string after *pend without the marker. 01276 * This will start the next run of strings, if it exists. */ 01277 for (i = start; i < n; i++) { 01278 str = sarrayGetString(sa, i, L_NOCOPY); 01279 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, 01280 strlen(substr), &offset, &found); 01281 if (loc < 0) { 01282 if (!found) break; 01283 } else { 01284 if (!found || offset != loc) break; 01285 } 01286 } 01287 if (i < n) 01288 *pnewstart = i; 01289 01290 return 0; 01291 } 01292 01293 01294 /*----------------------------------------------------------------------* 01295 * Sort * 01296 *----------------------------------------------------------------------*/ 01297 /*! 01298 * sarraySort() 01299 * 01300 * Input: saout (output sarray; can be NULL or equal to sain) 01301 * sain (input sarray) 01302 * sortorder (L_SORT_INCREASING or L_SORT_DECREASING) 01303 * Return: saout (output sarray, sorted by ascii value), or null on error 01304 * 01305 * Notes: 01306 * (1) Set saout = sain for in-place; otherwise, set naout = NULL. 01307 * (2) Shell sort, modified from K&R, 2nd edition, p.62. 01308 * Slow but simple O(n logn) sort. 01309 */ 01310 SARRAY * 01311 sarraySort(SARRAY *saout, 01312 SARRAY *sain, 01313 l_int32 sortorder) 01314 { 01315 char **array; 01316 char *tmp; 01317 l_int32 n, i, j, gap; 01318 01319 PROCNAME("sarraySort"); 01320 01321 if (!sain) 01322 return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL); 01323 01324 /* Make saout if necessary; otherwise do in-place */ 01325 if (!saout) 01326 saout = sarrayCopy(sain); 01327 else if (sain != saout) 01328 return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL); 01329 array = saout->array; /* operate directly on the array */ 01330 n = sarrayGetCount(saout); 01331 01332 /* Shell sort */ 01333 for (gap = n/2; gap > 0; gap = gap / 2) { 01334 for (i = gap; i < n; i++) { 01335 for (j = i - gap; j >= 0; j -= gap) { 01336 if ((sortorder == L_SORT_INCREASING && 01337 stringCompareLexical(array[j], array[j + gap])) || 01338 (sortorder == L_SORT_DECREASING && 01339 stringCompareLexical(array[j + gap], array[j]))) 01340 { 01341 tmp = array[j]; 01342 array[j] = array[j + gap]; 01343 array[j + gap] = tmp; 01344 } 01345 } 01346 } 01347 } 01348 01349 return saout; 01350 } 01351 01352 01353 /*! 01354 * stringCompareLexical() 01355 * 01356 * Input: str1 01357 * str2 01358 * Return: 1 if str1 > str2 (lexically); 0 otherwise 01359 * 01360 * Notes: 01361 * (1) If the lexical values are identical, return a 0, to 01362 * indicate that no swapping is required to sort the strings. 01363 */ 01364 l_int32 01365 stringCompareLexical(const char *str1, 01366 const char *str2) 01367 { 01368 l_int32 i, len1, len2, len; 01369 01370 PROCNAME("sarrayCompareLexical"); 01371 01372 if (!str1) 01373 return ERROR_INT("str1 not defined", procName, 1); 01374 if (!str2) 01375 return ERROR_INT("str2 not defined", procName, 1); 01376 01377 len1 = strlen(str1); 01378 len2 = strlen(str2); 01379 len = L_MIN(len1, len2); 01380 01381 for (i = 0; i < len; i++) { 01382 if (str1[i] == str2[i]) 01383 continue; 01384 if (str1[i] > str2[i]) 01385 return 1; 01386 else 01387 return 0; 01388 } 01389 01390 if (len1 > len2) 01391 return 1; 01392 else 01393 return 0; 01394 } 01395 01396 01397 /*----------------------------------------------------------------------* 01398 * Serialize for I/O * 01399 *----------------------------------------------------------------------*/ 01400 /*! 01401 * sarrayRead() 01402 * 01403 * Input: filename 01404 * Return: sarray, or null on error 01405 */ 01406 SARRAY * 01407 sarrayRead(const char *filename) 01408 { 01409 FILE *fp; 01410 SARRAY *sa; 01411 01412 PROCNAME("sarrayRead"); 01413 01414 if (!filename) 01415 return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL); 01416 01417 if ((fp = fopenReadStream(filename)) == NULL) 01418 return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL); 01419 01420 if ((sa = sarrayReadStream(fp)) == NULL) { 01421 fclose(fp); 01422 return (SARRAY *)ERROR_PTR("sa not read", procName, NULL); 01423 } 01424 01425 fclose(fp); 01426 return sa; 01427 } 01428 01429 01430 /*! 01431 * sarrayReadStream() 01432 * 01433 * Input: stream 01434 * Return: sarray, or null on error 01435 * 01436 * Notes: 01437 * (1) We store the size of each string along with the string. 01438 * (2) This allows a string to have embedded newlines. By reading 01439 * the entire string, as determined by its size, we are 01440 * not affected by any number of embedded newlines. 01441 */ 01442 SARRAY * 01443 sarrayReadStream(FILE *fp) 01444 { 01445 char *stringbuf; 01446 l_int32 i, n, size, index, bufsize, version, ignore; 01447 SARRAY *sa; 01448 01449 PROCNAME("sarrayReadStream"); 01450 01451 if (!fp) 01452 return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL); 01453 01454 if (fscanf(fp, "\nSarray Version %d\n", &version) != 1) 01455 return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL); 01456 if (version != SARRAY_VERSION_NUMBER) 01457 return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL); 01458 if (fscanf(fp, "Number of strings = %d\n", &n) != 1) 01459 return (SARRAY *)ERROR_PTR("error on # strings", procName, NULL); 01460 01461 if ((sa = sarrayCreate(n)) == NULL) 01462 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL); 01463 bufsize = L_BUF_SIZE + 1; 01464 if ((stringbuf = (char *)CALLOC(bufsize, sizeof(char))) == NULL) 01465 return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL); 01466 01467 for (i = 0; i < n; i++) { 01468 /* Get the size of the stored string */ 01469 if (fscanf(fp, "%d[%d]:", &index, &size) != 2) 01470 return (SARRAY *)ERROR_PTR("error on string size", procName, NULL); 01471 /* Expand the string buffer if necessary */ 01472 if (size > bufsize - 5) { 01473 FREE(stringbuf); 01474 bufsize = (l_int32)(1.5 * size); 01475 stringbuf = (char *)CALLOC(bufsize, sizeof(char)); 01476 } 01477 /* Read the stored string, plus leading spaces and trailing \n */ 01478 if (fread(stringbuf, 1, size + 3, fp) != size + 3) 01479 return (SARRAY *)ERROR_PTR("error reading string", procName, NULL); 01480 /* Remove the \n that was added by sarrayWriteStream() */ 01481 stringbuf[size + 2] = '\0'; 01482 /* Copy it in, skipping the 2 leading spaces */ 01483 sarrayAddString(sa, stringbuf + 2, L_COPY); 01484 } 01485 ignore = fscanf(fp, "\n"); 01486 01487 FREE(stringbuf); 01488 return sa; 01489 } 01490 01491 01492 /*! 01493 * sarrayWrite() 01494 * 01495 * Input: filename 01496 * sarray 01497 * Return: 0 if OK; 1 on error 01498 */ 01499 l_int32 01500 sarrayWrite(const char *filename, 01501 SARRAY *sa) 01502 { 01503 FILE *fp; 01504 01505 PROCNAME("sarrayWrite"); 01506 01507 if (!filename) 01508 return ERROR_INT("filename not defined", procName, 1); 01509 if (!sa) 01510 return ERROR_INT("sa not defined", procName, 1); 01511 01512 if ((fp = fopenWriteStream(filename, "w")) == NULL) 01513 return ERROR_INT("stream not opened", procName, 1); 01514 01515 if (sarrayWriteStream(fp, sa)) 01516 return ERROR_INT("sa not written to stream", procName, 1); 01517 01518 fclose(fp); 01519 return 0; 01520 } 01521 01522 01523 /*! 01524 * sarrayWriteStream() 01525 * 01526 * Input: stream 01527 * sarray 01528 * Returns 0 if OK; 1 on error 01529 * 01530 * Notes: 01531 * (1) This appends a '\n' to each string, which is stripped 01532 * off by sarrayReadStream(). 01533 */ 01534 l_int32 01535 sarrayWriteStream(FILE *fp, 01536 SARRAY *sa) 01537 { 01538 l_int32 i, n, len; 01539 01540 PROCNAME("sarrayWriteStream"); 01541 01542 if (!fp) 01543 return ERROR_INT("stream not defined", procName, 1); 01544 if (!sa) 01545 return ERROR_INT("sa not defined", procName, 1); 01546 01547 n = sarrayGetCount(sa); 01548 fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER); 01549 fprintf(fp, "Number of strings = %d\n", n); 01550 for (i = 0; i < n; i++) { 01551 len = strlen(sa->array[i]); 01552 fprintf(fp, " %d[%d]: %s\n", i, len, sa->array[i]); 01553 } 01554 fprintf(fp, "\n"); 01555 01556 return 0; 01557 } 01558 01559 01560 /*! 01561 * sarrayAppend() 01562 * 01563 * Input: filename 01564 * sarray 01565 * Return: 0 if OK; 1 on error 01566 */ 01567 l_int32 01568 sarrayAppend(const char *filename, 01569 SARRAY *sa) 01570 { 01571 FILE *fp; 01572 01573 PROCNAME("sarrayAppend"); 01574 01575 if (!filename) 01576 return ERROR_INT("filename not defined", procName, 1); 01577 if (!sa) 01578 return ERROR_INT("sa not defined", procName, 1); 01579 01580 if ((fp = fopenWriteStream(filename, "a")) == NULL) 01581 return ERROR_INT("stream not opened", procName, 1); 01582 01583 if (sarrayWriteStream(fp, sa)) 01584 return ERROR_INT("sa not appended to stream", procName, 1); 01585 01586 fclose(fp); 01587 return 0; 01588 } 01589 01590 01591 /*---------------------------------------------------------------------* 01592 * Directory filenames * 01593 *---------------------------------------------------------------------*/ 01594 /*! 01595 * getNumberedPathnamesInDirectory() 01596 * 01597 * Input: directory name 01598 * substr (<optional> substring filter on filenames; can be NULL) 01599 * numpre (number of characters in name before number) 01600 * numpost (number of characters in name after number, up 01601 * to a dot before an extension) 01602 * including an extension and the dot separator) 01603 * maxnum (only consider page numbers up to this value) 01604 * Return: sarray of sorted pathnames, or NULL on error 01605 * 01606 * Notes: 01607 * (1) Returns the full pathnames of the numbered filenames in 01608 * the directory. The number in the filename is the index 01609 * into the sarray. For indices for which there are no filenames, 01610 * an empty string ("") is placed into the sarray. 01611 * This makes reading numbered files very simple. For example, 01612 * the image whose filename includes number N can be retrieved using 01613 * pixReadIndexed(sa, N); 01614 * (2) If @substr is not NULL, only filenames that contain 01615 * the substring can be included. If @substr is NULL, 01616 * all matching filenames are used. 01617 * (3) If no numbered files are found, it returns an empty sarray, 01618 * with no initialized strings. 01619 * (4) It is assumed that the page number is contained within 01620 * the basename (the filename without directory or extension). 01621 * @numpre is the number of characters in the basename 01622 * preceeding the actual page number; @numpost is the number 01623 * following the page number, up to either the end of the 01624 * basename or a ".", whichever comes first. 01625 * (5) To use a O(n) matching algorithm, the largest page number 01626 * is found and two internal arrays of this size are created. 01627 * This maximum is constrained not to exceed @maxsum, 01628 * to make sure that an unrealistically large number is not 01629 * accidentally used to determine the array sizes. 01630 */ 01631 SARRAY * 01632 getNumberedPathnamesInDirectory(const char *dirname, 01633 const char *substr, 01634 l_int32 numpre, 01635 l_int32 numpost, 01636 l_int32 maxnum) 01637 { 01638 char *fname, *str; 01639 l_int32 i, nfiles, num, index; 01640 SARRAY *sa, *saout; 01641 01642 PROCNAME("getNumberedPathnamesInDirectory"); 01643 01644 if (!dirname) 01645 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL); 01646 01647 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL) 01648 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL); 01649 if ((nfiles = sarrayGetCount(sa)) == 0) 01650 return sarrayCreate(1); 01651 01652 /* Find the last file in the sorted array that has a number 01653 * that (a) matches the count pattern and (b) does not 01654 * exceed @maxnum. @maxnum sets an upper limit on the size 01655 * of the sarray. */ 01656 num = 0; 01657 for (i = nfiles - 1; i >= 0; i--) { 01658 fname = sarrayGetString(sa, i, L_NOCOPY); 01659 num = extractNumberFromFilename(fname, numpre, numpost); 01660 if (num < 0) continue; 01661 num = L_MIN(num + 1, maxnum); 01662 break; 01663 } 01664 01665 if (num <= 0) /* none found */ 01666 return sarrayCreate(1); 01667 01668 /* Insert pathnames into the output sarray. 01669 * Ignore numbers that are out of the range of sarray. */ 01670 saout = sarrayCreateInitialized(num, (char *)""); 01671 for (i = 0; i < nfiles; i++) { 01672 fname = sarrayGetString(sa, i, L_NOCOPY); 01673 index = extractNumberFromFilename(fname, numpre, numpost); 01674 if (index < 0 || index >= num) continue; 01675 str = sarrayGetString(saout, index, L_NOCOPY); 01676 if (str[0] != '\0') 01677 L_WARNING_INT("\n Multiple files with same number: %d", 01678 procName, index); 01679 sarrayReplaceString(saout, index, fname, L_COPY); 01680 } 01681 01682 sarrayDestroy(&sa); 01683 return saout; 01684 } 01685 01686 01687 /*! 01688 * getSortedPathnamesInDirectory() 01689 * 01690 * Input: directory name 01691 * substr (<optional> substring filter on filenames; can be NULL) 01692 * firstpage (0-based) 01693 * npages (use 0 for all to the end) 01694 * Return: sarray of sorted pathnames, or NULL on error 01695 * 01696 * Notes: 01697 * (1) If @substr is not NULL, only filenames that contain 01698 * the substring can be returned. If @substr == NULL, 01699 * none of the filenames are filtered out. 01700 * (2) The files in the directory, after optional filtering by 01701 * the substring, are lexically sorted in increasing order. 01702 * The full pathnames are returned for the requested sequence. 01703 * If no files are found after filtering, returns an empty sarray. 01704 */ 01705 SARRAY * 01706 getSortedPathnamesInDirectory(const char *dirname, 01707 const char *substr, 01708 l_int32 firstpage, 01709 l_int32 npages) 01710 { 01711 char *fname, *fullname; 01712 l_int32 i, nfiles, lastpage; 01713 SARRAY *sa, *safiles, *saout; 01714 01715 PROCNAME("getSortedPathnamesInDirectory"); 01716 01717 if (!dirname) 01718 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL); 01719 01720 if ((sa = getFilenamesInDirectory(dirname)) == NULL) 01721 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL); 01722 safiles = sarraySelectBySubstring(sa, substr); 01723 sarrayDestroy(&sa); 01724 nfiles = sarrayGetCount(safiles); 01725 if (nfiles == 0) { 01726 L_WARNING("no files found", procName); 01727 return safiles; 01728 } 01729 01730 sarraySort(safiles, safiles, L_SORT_INCREASING); 01731 01732 firstpage = L_MIN(L_MAX(firstpage, 0), nfiles - 1); 01733 if (npages == 0) 01734 npages = nfiles - firstpage; 01735 lastpage = L_MIN(firstpage + npages - 1, nfiles - 1); 01736 01737 saout = sarrayCreate(lastpage - firstpage + 1); 01738 for (i = firstpage; i <= lastpage; i++) { 01739 fname = sarrayGetString(safiles, i, L_NOCOPY); 01740 fullname = genPathname(dirname, fname); 01741 sarrayAddString(saout, fullname, L_INSERT); 01742 } 01743 01744 sarrayDestroy(&safiles); 01745 return saout; 01746 } 01747 01748 01749 /*! 01750 * getFilenamesInDirectory() 01751 * 01752 * Input: directory name 01753 * Return: sarray of file names, or NULL on error 01754 * 01755 * Notes: 01756 * (1) The versions compiled under unix and cygwin use the POSIX C 01757 * library commands for handling directories. For windows, 01758 * there is a separate implementation. 01759 * (2) It returns an array of filename tails; i.e., only the part of 01760 * the path after the last slash. 01761 * (3) Use of the d_type field of dirent is not portable: 01762 * "According to POSIX, the dirent structure contains a field 01763 * char d_name[] of unspecified size, with at most NAME_MAX 01764 * characters preceding the terminating null character. Use 01765 * of other fields will harm the portability of your programs." 01766 * (4) As a consequence of (3), we note several things: 01767 * - MINGW doesn't have a d_type member. 01768 * - Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN 01769 * for d_type from all files. 01770 * On these systems, this function will return directories 01771 * (except for '.' and '..', which are eliminated using 01772 * the d_name field). 01773 */ 01774 01775 #ifndef _WIN32 01776 01777 SARRAY * 01778 getFilenamesInDirectory(const char *dirname) 01779 { 01780 char *name; 01781 l_int32 len; 01782 SARRAY *safiles; 01783 DIR *pdir; 01784 struct dirent *pdirentry; 01785 01786 PROCNAME("getFilenamesInDirectory"); 01787 01788 if (!dirname) 01789 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL); 01790 01791 if ((pdir = opendir(dirname)) == NULL) 01792 return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL); 01793 if ((safiles = sarrayCreate(0)) == NULL) 01794 return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL); 01795 while ((pdirentry = readdir(pdir))) { 01796 01797 /* It's nice to ignore directories. For this it is necessary to 01798 * define _BSD_SOURCE in the CC command, because the DT_DIR 01799 * flag is non-standard. */ 01800 #if !defined(__SOLARIS__) 01801 if (pdirentry->d_type == DT_DIR) 01802 continue; 01803 #endif 01804 01805 /* Filter out "." and ".." if they're passed through */ 01806 name = pdirentry->d_name; 01807 len = strlen(name); 01808 if (len == 1 && name[len - 1] == '.') continue; 01809 if (len == 2 && name[len - 1] == '.' && name[len - 2] == '.') continue; 01810 sarrayAddString(safiles, name, L_COPY); 01811 } 01812 closedir(pdir); 01813 01814 return safiles; 01815 } 01816 01817 #else /* _WIN32 */ 01818 01819 /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */ 01820 #include <windows.h> 01821 01822 SARRAY * 01823 getFilenamesInDirectory(const char *dirname) 01824 { 01825 char *pszDir; 01826 char *tempname; 01827 HANDLE hFind = INVALID_HANDLE_VALUE; 01828 SARRAY *safiles; 01829 WIN32_FIND_DATAA ffd; 01830 01831 PROCNAME("getFilenamesInDirectory"); 01832 01833 if (!dirname) 01834 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL); 01835 01836 tempname = genPathname(dirname, NULL); 01837 pszDir = stringJoin(tempname, "\\*"); 01838 FREE(tempname); 01839 01840 if (strlen(pszDir) + 1 > MAX_PATH) { 01841 FREE(pszDir); 01842 return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL); 01843 } 01844 01845 if ((safiles = sarrayCreate(0)) == NULL) { 01846 FREE(pszDir); 01847 return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL); 01848 } 01849 01850 hFind = FindFirstFileA(pszDir, &ffd); 01851 if (INVALID_HANDLE_VALUE == hFind) { 01852 sarrayDestroy(&safiles); 01853 FREE(pszDir); 01854 return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL); 01855 } 01856 01857 while (FindNextFileA(hFind, &ffd) != 0) { 01858 if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) /* skip dirs */ 01859 continue; 01860 sarrayAddString(safiles, ffd.cFileName, L_COPY); 01861 } 01862 01863 FindClose(hFind); 01864 FREE(pszDir); 01865 return safiles; 01866 } 01867 01868 #endif /* _WIN32 */ 01869