Leptonica 1.68
C Image Processing Library

sarray.c

Go to the documentation of this file.
00001 /*====================================================================*
00002  -  Copyright (C) 2001 Leptonica.  All rights reserved.
00003  -  This software is distributed in the hope that it will be
00004  -  useful, but with NO WARRANTY OF ANY KIND.
00005  -  No author or distributor accepts responsibility to anyone for the
00006  -  consequences of using this software, or for whether it serves any
00007  -  particular purpose or works at all, unless he or she says so in
00008  -  writing.  Everyone is granted permission to copy, modify and
00009  -  redistribute this source code, for commercial or non-commercial
00010  -  purposes, with the following restrictions: (1) the origin of this
00011  -  source code must not be misrepresented; (2) modified versions must
00012  -  be plainly marked as such; and (3) this notice may not be removed
00013  -  or altered from any source or modified source distribution.
00014  *====================================================================*/
00015 
00016 
00017 /*
00018  *   sarray.c
00019  *
00020  *      Create/Destroy/Copy
00021  *          SARRAY    *sarrayCreate()
00022  *          SARRAY    *sarrayCreateInitialized()
00023  *          SARRAY    *sarrayCreateWordsFromString()
00024  *          SARRAY    *sarrayCreateLinesFromString()
00025  *          void       sarrayDestroy()
00026  *          SARRAY    *sarrayCopy()
00027  *          SARRAY    *sarrayClone()
00028  *
00029  *      Add/Remove string
00030  *          l_int32    sarrayAddString()
00031  *          l_int32    sarrayExtendArray()
00032  *          char      *sarrayRemoveString()
00033  *          l_int32    sarrayReplaceString()
00034  *          l_int32    sarrayClear()
00035  *
00036  *      Accessors
00037  *          l_int32    sarrayGetCount()
00038  *          char     **sarrayGetArray()
00039  *          char      *sarrayGetString()
00040  *          l_int32    sarrayGetRefcount()
00041  *          l_int32    sarrayChangeRefcount()
00042  *
00043  *      Conversion back to string
00044  *          char      *sarrayToString()
00045  *          char      *sarrayToStringRange()
00046  *
00047  *      Concatenate 2 sarrays
00048  *          l_int32    sarrayConcatenate()
00049  *          l_int32    sarrayAppendRange()
00050  *
00051  *      Pad an sarray to be the same size as another sarray
00052  *          l_int32    sarrayPadToSameSize()
00053  *
00054  *      Convert word sarray to (formatted) line sarray
00055  *          SARRAY    *sarrayConvertWordsToLines()
00056  *
00057  *      Split string on separator list
00058  *          SARRAY    *sarraySplitString()
00059  *
00060  *      Filter sarray
00061  *          SARRAY    *sarraySelectBySubstring()
00062  *          SARRAY    *sarraySelectByRange()
00063  *          l_int32    sarrayParseRange()
00064  *
00065  *      Sort
00066  *          SARRAY    *sarraySort()
00067  *          l_int32    stringCompareLexical()
00068  *
00069  *      Serialize for I/O
00070  *          SARRAY    *sarrayRead()
00071  *          SARRAY    *sarrayReadStream()
00072  *          l_int32    sarrayWrite()
00073  *          l_int32    sarrayWriteStream()
00074  *          l_int32    sarrayAppend()
00075  *
00076  *      Directory filenames
00077  *          SARRAY    *getNumberedPathnamesInDirectory()
00078  *          SARRAY    *getSortedPathnamesInDirectory()
00079  *          SARRAY    *getFilenamesInDirectory()
00080  *
00081  *      Comments on usage:
00082  *
00083  *          These functions are important for efficient manipulation
00084  *          of string data.  They have been used in leptonica for
00085  *          generating and parsing text files, and for generating
00086  *          code for compilation.  The user is responsible for
00087  *          correctly disposing of strings that have been extracted
00088  *          from sarrays.
00089  *
00090  *            - When you want a string from an Sarray to inspect it, or
00091  *              plan to make a copy of it later, use sarrayGetString()
00092  *              with copyflag = 0.  In this case, you must neither free
00093  *              the string nor put it directly in another array.
00094  *              We provide the copyflag constant L_NOCOPY, which is 0,
00095  *              for this purpose:
00096  *                 str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
00097  *              To extract a copy of a string, use:
00098  *                 str-owned = sarrayGetString(sa, index, L_COPY);
00099  *
00100  *            - When you want to insert a string that is in one
00101  *              array into another array (always leaving the first
00102  *              array intact), you have two options:
00103  *                 (1) use copyflag = L_COPY to make an immediate copy,
00104  *                     which you must then add to the second array
00105  *                     by insertion; namely,
00106  *                       str-owned = sarrayGetString(sa1, index, L_COPY);
00107  *                       sarrayAddString(sa2, str-owned, L_INSERT);
00108  *                 (2) use copyflag = L_NOCOPY to get another handle to
00109  *                     the string, in which case you must add
00110  *                     a copy of it to the second string array:
00111  *                       str-not-owned = sarrayGetString(sa1, index, L_NOCOPY);
00112  *                       sarrayAddString(sa2, str-not-owned, L_COPY).
00113  *
00114  *              In all cases, when you use copyflag = L_COPY to extract
00115  *              a string from an array, you must either free it
00116  *              or insert it in an array that will be freed later.
00117  */
00118 
00119 #include <string.h>
00120 #ifndef _WIN32
00121 #include <dirent.h>     /* unix only */
00122 #endif  /* ! _WIN32 */
00123 #include "allheaders.h"
00124 
00125 static const l_int32  INITIAL_PTR_ARRAYSIZE = 50;     /* arbitrary default size used by sarrayCreate() when n <= 0 */
00126 static const l_int32  L_BUF_SIZE = 512;  /* NOTE(review): not referenced in this part of the file; presumably a scratch buffer size -- confirm */
00127 
00128 
00129 /*--------------------------------------------------------------------------*
00130  *                   String array create/destroy/copy/extend                *
00131  *--------------------------------------------------------------------------*/
00132 /*!
00133  *  sarrayCreate()
00134  *
00135  *      Input:  size of string ptr array to be alloc'd
00136  *              (use 0 for default)
00137  *      Return: sarray, or null on error
00138  */
00139 SARRAY *
00140 sarrayCreate(l_int32  n)
00141 {
00142 SARRAY  *sa;
00143 
00144     PROCNAME("sarrayCreate");
00145 
00146     if (n <= 0)
00147         n = INITIAL_PTR_ARRAYSIZE;
00148 
00149     if ((sa = (SARRAY *)CALLOC(1, sizeof(SARRAY))) == NULL)
00150         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
00151     if ((sa->array = (char **)CALLOC(n, sizeof(char *))) == NULL)
00152         return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
00153 
00154     sa->nalloc = n;
00155     sa->n = 0;
00156     sa->refcount = 1;
00157     return sa;
00158 }
00159 
00160 
00161 /*!
00162  *  sarrayCreateInitialized()
00163  *
00164  *      Input:  n (size of string ptr array to be alloc'd)
00165  *              initstr (string to be initialized on the full array)
00166  *      Return: sarray, or null on error
00167  */
00168 SARRAY *
00169 sarrayCreateInitialized(l_int32  n,
00170                         char    *initstr)
00171 {
00172 l_int32  i;
00173 SARRAY  *sa;
00174 
00175     PROCNAME("sarrayCreateInitialized");
00176 
00177     if (n <= 0)
00178         return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL);
00179     if (!initstr)
00180         return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL);
00181 
00182     sa = sarrayCreate(n);
00183     for (i = 0; i < n; i++)
00184         sarrayAddString(sa, initstr, L_COPY);
00185     return sa;
00186 }
00187 
00188 
00189 /*!
00190  *  sarrayCreateWordsFromString()
00191  *
00192  *      Input:  string
00193  *      Return: sarray, or null on error
00194  *
00195  *  Notes:
00196  *      (1) This finds the number of word substrings, creates an sarray
00197  *          of this size, and puts copies of each substring into the sarray.
00198  */
00199 SARRAY *
00200 sarrayCreateWordsFromString(const char  *string)
00201 {
00202 char     separators[] = " \n\t";
00203 l_int32  i, nsub, size, inword;
00204 SARRAY  *sa;
00205 
00206     PROCNAME("sarrayCreateWordsFromString");
00207 
00208     if (!string)
00209         return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
00210 
00211         /* Find the number of words */
00212     size = strlen(string);
00213     nsub = 0;
00214     inword = FALSE;
00215     for (i = 0; i < size; i++) {
00216         if (inword == FALSE &&
00217            (string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
00218            inword = TRUE;
00219            nsub++;
00220         }
00221         else if (inword == TRUE &&
00222            (string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
00223            inword = FALSE;
00224         }
00225     }
00226 
00227     if ((sa = sarrayCreate(nsub)) == NULL)
00228         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
00229     sarraySplitString(sa, string, separators);
00230 
00231     return sa;
00232 }
00233 
00234 
00235 /*!
00236  *  sarrayCreateLinesFromString()
00237  *
00238  *      Input:  string
00239  *              blankflag  (0 to exclude blank lines; 1 to include)
00240  *      Return: sarray, or null on error
00241  *
00242  *  Notes:
00243  *      (1) This finds the number of line substrings, each of which
00244  *          ends with a newline, and puts a copy of each substring
00245  *          in a new sarray.
00246  *      (2) The newline characters are removed from each substring.
00247  */
00248 SARRAY *
00249 sarrayCreateLinesFromString(char    *string,
00250                             l_int32  blankflag)
00251 {
00252 l_int32  i, nsub, size, startptr;
00253 char    *cstring, *substring;
00254 SARRAY  *sa;
00255 
00256     PROCNAME("sarrayCreateLinesFromString");
00257 
00258     if (!string)
00259         return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
00260 
00261         /* find the number of lines */
00262     size = strlen(string);
00263     nsub = 0;
00264     for (i = 0; i < size; i++) {
00265         if (string[i] == '\n')
00266             nsub++;
00267     }
00268 
00269     if ((sa = sarrayCreate(nsub)) == NULL)
00270         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
00271 
00272     if (blankflag) {  /* keep blank lines as null strings */
00273             /* Make a copy for munging */
00274         if ((cstring = stringNew(string)) == NULL)
00275             return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
00276             /* We'll insert nulls like strtok */
00277         startptr = 0;
00278         for (i = 0; i < size; i++) {
00279             if (cstring[i] == '\n') {
00280                 cstring[i] = '\0';
00281                 if (i > 0 && cstring[i - 1] == '\r')
00282                     cstring[i - 1] = '\0';  /* also remove Windows CR */
00283                 if ((substring = stringNew(cstring + startptr)) == NULL)
00284                     return (SARRAY *)ERROR_PTR("substring not made",
00285                                                 procName, NULL);
00286                 sarrayAddString(sa, substring, L_INSERT);
00287 /*                fprintf(stderr, "substring = %s\n", substring); */
00288                 startptr = i + 1;
00289             }
00290         }
00291         if (startptr < size) {  /* no newline at end of last line */
00292             if ((substring = stringNew(cstring + startptr)) == NULL)
00293                 return (SARRAY *)ERROR_PTR("substring not made",
00294                                             procName, NULL);
00295             sarrayAddString(sa, substring, L_INSERT);
00296 /*            fprintf(stderr, "substring = %s\n", substring); */
00297         }
00298         FREE(cstring);
00299     }
00300     else {  /* remove blank lines; use strtok */
00301         sarraySplitString(sa, string, "\r\n");
00302     }
00303 
00304     return sa;
00305 }
00306 
00307 
00308 /*!
00309  *  sarrayDestroy()
00310  *
00311  *      Input:  &sarray <to be nulled>
00312  *      Return: void
00313  *
00314  *  Notes:
00315  *      (1) Decrements the ref count and, if 0, destroys the sarray.
00316  *      (2) Always nulls the input ptr.
00317  */
00318 void
00319 sarrayDestroy(SARRAY  **psa)
00320 {
00321 l_int32  i;
00322 SARRAY  *sa;
00323 
00324     PROCNAME("sarrayDestroy");
00325 
00326     if (psa == NULL) {
00327         L_WARNING("ptr address is NULL!", procName);
00328         return;
00329     }
00330     if ((sa = *psa) == NULL)
00331         return;
00332 
00333     sarrayChangeRefcount(sa, -1);
00334     if (sarrayGetRefcount(sa) <= 0) {
00335         if (sa->array) {
00336             for (i = 0; i < sa->n; i++) {
00337                 if (sa->array[i])
00338                     FREE(sa->array[i]);
00339             }
00340             FREE(sa->array);
00341         }
00342         FREE(sa);
00343     }
00344 
00345     *psa = NULL;
00346     return;
00347 }
00348 
00349         
00350 /*!
00351  *  sarrayCopy()
00352  *
00353  *      Input:  sarray
00354  *      Return: copy of sarray, or null on error
00355  */
00356 SARRAY *
00357 sarrayCopy(SARRAY  *sa)
00358 {
00359 l_int32  i;
00360 SARRAY  *csa;
00361 
00362     PROCNAME("sarrayCopy");
00363 
00364     if (!sa)
00365         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
00366 
00367     if ((csa = sarrayCreate(sa->nalloc)) == NULL)
00368         return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);
00369 
00370     for (i = 0; i < sa->n; i++)
00371         sarrayAddString(csa, sa->array[i], L_COPY);
00372 
00373     return csa;
00374 }
00375 
00376 
00377 /*!
00378  *  sarrayClone()
00379  *
00380  *      Input:  sarray
00381  *      Return: ptr to same sarray, or null on error
00382  */
00383 SARRAY *
00384 sarrayClone(SARRAY  *sa)
00385 {
00386     PROCNAME("sarrayClone");
00387 
00388     if (!sa)
00389         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
00390     sarrayChangeRefcount(sa, 1);
00391     return sa;
00392 }
00393 
00394 
00395 /*!
00396  *  sarrayAddString()
00397  *
00398  *      Input:  sarray
00399  *              string  (string to be added)
00400  *              copyflag (L_INSERT, L_COPY)
00401  *      Return: 0 if OK, 1 on error
00402  *
00403  *  Notes:
00404  *      (1) Legacy usage decrees that we always use 0 to insert a string
00405  *          directly and 1 to insert a copy of the string.  The
00406  *          enums for L_INSERT and L_COPY agree with this convention,
00407  *          and will not change in the future.
00408  *      (2) See usage comments at the top of this file.
00409  */
00410 l_int32
00411 sarrayAddString(SARRAY  *sa,
00412                 char    *string,
00413                 l_int32  copyflag)
00414 {
00415 l_int32  n;
00416 
00417     PROCNAME("sarrayAddString");
00418 
00419     if (!sa)
00420         return ERROR_INT("sa not defined", procName, 1);
00421     if (!string)
00422         return ERROR_INT("string not defined", procName, 1);
00423     if (copyflag != L_INSERT && copyflag != L_COPY)
00424         return ERROR_INT("invalid copyflag", procName, 1);
00425     
00426     n = sarrayGetCount(sa);
00427     if (n >= sa->nalloc)
00428         sarrayExtendArray(sa);
00429 
00430     if (copyflag == L_INSERT)
00431         sa->array[n] = string;
00432     else  /* L_COPY */
00433         sa->array[n] = stringNew(string);
00434     sa->n++;
00435 
00436     return 0;
00437 }
00438 
00439 
00440 /*!
00441  *  sarrayExtendArray()
00442  *
00443  *      Input:  sarray
00444  *      Return: 0 if OK, 1 on error
00445  */
00446 l_int32
00447 sarrayExtendArray(SARRAY  *sa)
00448 {
00449     PROCNAME("sarrayExtendArray");
00450 
00451     if (!sa)
00452         return ERROR_INT("sa not defined", procName, 1);
00453 
00454     if ((sa->array = (char **)reallocNew((void **)&sa->array,
00455                               sizeof(char *) * sa->nalloc,
00456                               2 * sizeof(char *) * sa->nalloc)) == NULL)
00457             return ERROR_INT("new ptr array not returned", procName, 1);
00458 
00459     sa->nalloc *= 2;
00460     return 0;
00461 }
00462 
00463 
00464 /*!
00465  *  sarrayRemoveString()
00466  *
00467  *      Input:  sarray
00468  *              index (of string within sarray)
00469  *      Return: removed string, or null on error
00470  */
00471 char *
00472 sarrayRemoveString(SARRAY  *sa,
00473                    l_int32  index)
00474 {
00475 char    *string;
00476 char   **array;
00477 l_int32  i, n, nalloc;
00478 
00479     PROCNAME("sarrayRemoveString");
00480 
00481     if (!sa)
00482         return (char *)ERROR_PTR("sa not defined", procName, NULL);
00483     
00484     if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
00485         return (char *)ERROR_PTR("array not returned", procName, NULL);
00486 
00487     if (index < 0 || index >= n)
00488         return (char *)ERROR_PTR("array index out of bounds", procName, NULL);
00489 
00490     string = array[index];
00491 
00492         /* If removed string is not at end of array, shift
00493          * to fill in, maintaining original ordering.
00494          * Note: if we didn't care about the order, we could
00495          * put the last string array[n - 1] directly into the hole.  */
00496     for (i = index; i < n - 1; i++)
00497         array[i] = array[i + 1];
00498 
00499     sa->n--;
00500     return string;
00501 }
00502 
00503 
00504 /*!
00505  *  sarrayReplaceString()
00506  *
00507  *      Input:  sarray
00508  *              index (of string within sarray to be replaced)
00509  *              newstr (string to replace existing one)
00510  *              copyflag (L_INSERT, L_COPY)
00511  *      Return: 0 if OK, 1 on error
00512  *
00513  *  Notes:
00514  *      (1) This destroys an existing string and replaces it with
00515  *          the new string or a copy of it.
00516  *      (2) By design, an sarray is always compacted, so there are
00517  *          never any holes (null ptrs) in the ptr array up to the
00518  *          current count.
00519  */
00520 l_int32
00521 sarrayReplaceString(SARRAY  *sa,
00522                     l_int32  index,
00523                     char    *newstr,
00524                     l_int32  copyflag)
00525 {
00526 char    *str;
00527 l_int32  n;
00528 
00529     PROCNAME("sarrayReplaceString");
00530 
00531     if (!sa)
00532         return ERROR_INT("sa not defined", procName, 1);
00533     n = sarrayGetCount(sa);
00534     if (index < 0 || index >= n)
00535         return ERROR_INT("array index out of bounds", procName, 1);
00536     if (!newstr)
00537         return ERROR_INT("newstr not defined", procName, 1);
00538     if (copyflag != L_INSERT && copyflag != L_COPY)
00539         return ERROR_INT("invalid copyflag", procName, 1);
00540 
00541     FREE(sa->array[index]);
00542     if (copyflag == L_INSERT)
00543         str = newstr;
00544     else  /* L_COPY */
00545         str = stringNew(newstr);
00546     sa->array[index] = str;
00547     return 0;
00548 }
00549 
00550 
00551 /*!
00552  *  sarrayClear()
00553  *
00554  *      Input:  sarray
00555  *      Return: 0 if OK; 1 on error
00556  */
00557 l_int32
00558 sarrayClear(SARRAY  *sa)
00559 {
00560 l_int32  i;
00561 
00562     PROCNAME("sarrayClear");
00563 
00564     if (!sa)
00565         return ERROR_INT("sa not defined", procName, 1);
00566     for (i = 0; i < sa->n; i++) {  /* free strings and null ptrs */
00567         FREE(sa->array[i]);
00568         sa->array[i] = NULL;
00569     }
00570     sa->n = 0;
00571     return 0;
00572 }
00573 
00574         
00575 /*----------------------------------------------------------------------*
00576  *                               Accessors                              *
00577  *----------------------------------------------------------------------*/
00578 /*!
00579  *  sarrayGetCount()
00580  *
00581  *      Input:  sarray
00582  *      Return: count, or 0 if no strings or on error
00583  */
00584 l_int32
00585 sarrayGetCount(SARRAY  *sa)
00586 {
00587     PROCNAME("sarrayGetCount");
00588 
00589     if (!sa)
00590         return ERROR_INT("sa not defined", procName, 0);
00591     return sa->n;
00592 }
00593         
00594 
00595 /*!
00596  *  sarrayGetArray()
00597  *
00598  *      Input:  sarray
00599  *              &nalloc  (<optional return> number allocated string ptrs)
00600  *              &n  (<optional return> number allocated strings)
00601  *      Return: ptr to string array, or null on error
00602  *
00603  *  Notes:
00604  *      (1) Caution: the returned array is not a copy, so caller
00605  *          must not destroy it!
00606  */
00607 char **
00608 sarrayGetArray(SARRAY   *sa,
00609                l_int32  *pnalloc,
00610                l_int32  *pn)
00611 {
00612 char  **array;
00613 
00614     PROCNAME("sarrayGetArray");
00615 
00616     if (!sa)
00617         return (char **)ERROR_PTR("sa not defined", procName, NULL);
00618 
00619     array = sa->array;
00620     if (pnalloc) *pnalloc = sa->nalloc;
00621     if (pn) *pn = sa->n;
00622 
00623     return array;
00624 }
00625 
00626 
00627 /*!
00628  *  sarrayGetString()
00629  *
00630  *      Input:  sarray
00631  *              index   (to the index-th string)
00632  *              copyflag  (L_NOCOPY or L_COPY)
00633  *      Return: string, or null on error
00634  *
00635  *  Notes:
00636  *      (1) Legacy usage decrees that we always use 0 to get the
00637  *          pointer to the string itself, and 1 to get a copy of
00638  *          the string.
00639  *      (2) See usage comments at the top of this file.
00640  *      (3) To get a pointer to the string itself, use for copyflag:
00641  *             L_NOCOPY or 0 or FALSE
00642  *          To get a copy of the string, use for copyflag:
00643  *             L_COPY or 1 or TRUE
00644  *          The const values of L_NOCOPY and L_COPY are guaranteed not
00645  *          to change.
00646  */
00647 char *
00648 sarrayGetString(SARRAY  *sa,
00649                 l_int32  index,
00650                 l_int32  copyflag)
00651 {
00652     PROCNAME("sarrayGetString");
00653 
00654     if (!sa)
00655         return (char *)ERROR_PTR("sa not defined", procName, NULL);
00656     if (index < 0 || index >= sa->n)
00657         return (char *)ERROR_PTR("index not valid", procName, NULL);
00658     if (copyflag != L_NOCOPY && copyflag != L_COPY)
00659         return (char *)ERROR_PTR("invalid copyflag", procName, NULL);
00660 
00661     if (copyflag == L_NOCOPY)
00662         return sa->array[index];
00663     else  /* L_COPY */
00664         return stringNew(sa->array[index]);
00665 }
00666 
00667 
00668 /*!
00669  *  sarrayGetRefCount()
00670  *
00671  *      Input:  sarray
00672  *      Return: refcount, or UNDEF on error
00673  */
00674 l_int32
00675 sarrayGetRefcount(SARRAY  *sa)
00676 {
00677     PROCNAME("sarrayGetRefcount");
00678 
00679     if (!sa)
00680         return ERROR_INT("sa not defined", procName, UNDEF);
00681     return sa->refcount;
00682 }
00683 
00684 
00685 /*!
00686  *  sarrayChangeRefCount()
00687  *
00688  *      Input:  sarray
00689  *              delta (change to be applied)
00690  *      Return: 0 if OK, 1 on error
00691  */
00692 l_int32
00693 sarrayChangeRefcount(SARRAY  *sa,
00694                      l_int32  delta)
00695 {
00696     PROCNAME("sarrayChangeRefcount");
00697 
00698     if (!sa)
00699         return ERROR_INT("sa not defined", procName, UNDEF);
00700     sa->refcount += delta;
00701     return 0;
00702 }
00703 
00704 
00705 /*----------------------------------------------------------------------*
00706  *                      Conversion to string                           *
00707  *----------------------------------------------------------------------*/
00708 /*!
00709  *  sarrayToString()
00710  *
00711  *      Input:  sarray
00712  *              addnlflag (flag: 0 adds nothing to each substring
00713  *                               1 adds '\n' to each substring
00714  *                               2 adds ' ' to each substring)
00715  *      Return: dest string, or null on error
00716  *
00717  *  Notes:
00718  *      (1) Concatenates all the strings in the sarray, preserving
00719  *          all white space.
00720  *      (2) If addnlflag != 0, adds either a '\n' or a ' ' after
00721  *          each substring.
00722  *      (3) This function was NOT implemented as:
00723  *            for (i = 0; i < n; i++)
00724  *                     strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
00725  *          Do you see why?
00726  */
00727 char *
00728 sarrayToString(SARRAY  *sa,
00729                l_int32  addnlflag)
00730 {
00731     PROCNAME("sarrayToString");
00732 
00733     if (!sa)
00734         return (char *)ERROR_PTR("sa not defined", procName, NULL);
00735 
00736     return sarrayToStringRange(sa, 0, 0, addnlflag);
00737 }
00738 
00739 
00740 /*!
00741  *  sarrayToStringRange()
00742  *
00743  *      Input: sarray
00744  *             first  (index of first string to use; starts with 0)
00745  *             nstrings (number of strings to append into the result; use
00746  *                       0 to append to the end of the sarray)
00747  *             addnlflag (flag: 0 adds nothing to each substring
00748  *                              1 adds '\n' to each substring
00749  *                              2 adds ' ' to each substring)
00750  *      Return: dest string, or null on error
00751  *
00752  *  Notes:
00753  *      (1) Concatenates the specified strings inthe sarray, preserving
00754  *          all white space.
00755  *      (2) If addnlflag != 0, adds either a '\n' or a ' ' after
00756  *          each substring.
00757  *      (3) If the sarray is empty, this returns a string with just
00758  *          the character corresponding to @addnlflag.
00759  */
00760 char *
00761 sarrayToStringRange(SARRAY  *sa,
00762                     l_int32  first,
00763                     l_int32  nstrings,
00764                     l_int32  addnlflag)
00765 {
00766 char    *dest, *src, *str;
00767 l_int32  n, i, last, size, index, len;
00768 
00769     PROCNAME("sarrayToStringRange");
00770 
00771     if (!sa)
00772         return (char *)ERROR_PTR("sa not defined", procName, NULL);
00773     if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2)
00774         return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);
00775 
00776     n = sarrayGetCount(sa);
00777 
00778         /* Empty sa; return char corresponding to addnlflag only */
00779     if (n == 0) {
00780         if (first == 0) {
00781             if (addnlflag == 0)
00782                 return stringNew("");
00783             if (addnlflag == 1)
00784                 return stringNew("\n");
00785             else  /* addnlflag == 2) */
00786                 return stringNew(" ");
00787         }
00788         else
00789             return (char *)ERROR_PTR("first not valid", procName, NULL);
00790     }
00791 
00792     if (first < 0 || first >= n)
00793         return (char *)ERROR_PTR("first not valid", procName, NULL);
00794     if (nstrings == 0 || (nstrings > n - first))
00795         nstrings = n - first;  /* no overflow */
00796     last = first + nstrings - 1;
00797 
00798     size = 0;
00799     for (i = first; i <= last; i++) {
00800         if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
00801             return (char *)ERROR_PTR("str not found", procName, NULL);
00802         size += strlen(str) + 2;
00803     }
00804 
00805     if ((dest = (char *)CALLOC(size + 1, sizeof(char))) == NULL)
00806         return (char *)ERROR_PTR("dest not made", procName, NULL);
00807 
00808     index = 0;
00809     for (i = first; i <= last; i++) {
00810         src = sarrayGetString(sa, i, L_NOCOPY);
00811         len = strlen(src);
00812         memcpy(dest + index, src, len);
00813         index += len;
00814         if (addnlflag == 1) {
00815             dest[index] = '\n';
00816             index++;
00817         }
00818         else if (addnlflag == 2) {
00819             dest[index] = ' ';
00820             index++;
00821         }
00822     }
00823 
00824     return dest;
00825 }
00826 
00827 
00828 /*----------------------------------------------------------------------*
00829  *                      Concatenate 2 sarrays                           *
00830  *----------------------------------------------------------------------*/
00831 /*!
00832  *  sarrayConcatenate()
00833  *
00834  *      Input:  sa1  (to be added to)
00835  *              sa2  (append to sa1)
00836  *      Return: 0 if OK, 1 on error
00837  *
00838  *  Notes:
00839  *      (1) Copies of the strings in sarray2 are added to sarray1.
00840  */
00841 l_int32
00842 sarrayConcatenate(SARRAY  *sa1,
00843                   SARRAY  *sa2)
00844 {
00845 char    *str;
00846 l_int32  n, i;
00847 
00848     PROCNAME("sarrayConcatenate");
00849 
00850     if (!sa1)
00851         return ERROR_INT("sa1 not defined", procName, 1);
00852     if (!sa2)
00853         return ERROR_INT("sa2 not defined", procName, 1);
00854 
00855     n = sarrayGetCount(sa2);
00856     for (i = 0; i < n; i++) {
00857         str = sarrayGetString(sa2, i, L_NOCOPY);
00858         sarrayAddString(sa1, str, L_COPY);
00859     }
00860 
00861     return 0;
00862 }
00863 
00864 
00865 /*!
00866  *  sarrayAppendRange()
00867  *
00868  *      Input:  sa1  (to be added to)
00869  *              sa2  (append specified range of strings in sa2 to sa1)
00870  *              start (index of first string of sa2 to append)
00871  *              end (index of last string of sa2 to append)
00872  *      Return: 0 if OK, 1 on error
00873  *
00874  *  Notes:
00875  *      (1) Copies of the strings in sarray2 are added to sarray1.
00876  *      (2) The [start ... end] range is truncated if necessary.
00877  */
00878 l_int32
00879 sarrayAppendRange(SARRAY  *sa1,
00880                   SARRAY  *sa2,
00881                   l_int32  start,
00882                   l_int32  end)
00883 {
00884 char    *str;
00885 l_int32  n, i;
00886 
00887     PROCNAME("sarrayAppendRange");
00888 
00889     if (!sa1)
00890         return ERROR_INT("sa1 not defined", procName, 1);
00891     if (!sa2)
00892         return ERROR_INT("sa2 not defined", procName, 1);
00893     if (start < 0)
00894         start = 0;
00895     n = sarrayGetCount(sa2);
00896     if (end >= n)
00897         end = n - 1;
00898     if (start > end)
00899         return ERROR_INT("start > end", procName, 1);
00900 
00901     for (i = start; i <= end; i++) {
00902         str = sarrayGetString(sa2, i, L_NOCOPY);
00903         sarrayAddString(sa1, str, L_COPY);
00904     }
00905 
00906     return 0;
00907 }
00908 
00909 
00910 /*----------------------------------------------------------------------*
00911  *          Pad an sarray to be the same size as another sarray         *
00912  *----------------------------------------------------------------------*/
00913 /*!
00914  *  sarrayPadToSameSize()
00915  *
00916  *      Input:  sa1, sa2
00917  *              padstring
00918  *      Return: 0 if OK, 1 on error
00919  *
00920  *  Notes:
00921  *      (1) If two sarrays have different size, this adds enough
00922  *          instances of @padstring to the smaller so that they are
00923  *          the same size.  It is useful when two or more sarrays
00924  *          are being sequenced in parallel, and it is necessary to
00925  *          find a valid string at each index.
00926  */
00927 l_int32
00928 sarrayPadToSameSize(SARRAY  *sa1,
00929                     SARRAY  *sa2,
00930                     char    *padstring)
00931 {
00932 l_int32  i, n1, n2;
00933 
00934     PROCNAME("sarrayPadToSameSize");
00935 
00936     if (!sa1 || !sa2)
00937         return ERROR_INT("both sa1 and sa2 not defined", procName, 1);
00938 
00939     n1 = sarrayGetCount(sa1);
00940     n2 = sarrayGetCount(sa2);
00941     if (n1 < n2) {
00942         for (i = n1; i < n2; i++)
00943             sarrayAddString(sa1, padstring, L_COPY);
00944     }
00945     else if (n1 > n2) {
00946         for (i = n2; i < n1; i++)
00947             sarrayAddString(sa2, padstring, L_COPY);
00948     }
00949 
00950     return 0;
00951 }
00952 
00953 
00954 /*----------------------------------------------------------------------*
00955  *                   Convert word sarray to line sarray                 *
00956  *----------------------------------------------------------------------*/
00957 /*! 
00958  *  sarrayConvertWordsToLines()
00959  *
00960  *      Input:  sa  (sa of individual words)
00961  *              linesize  (max num of chars in each line)
00962  *      Return: saout (sa of formatted lines), or null on error
00963  *
00964  *  This is useful for re-typesetting text to a specific maximum
00965  *  line length.  The individual words in the input sarray
00966  *  are concatenated into textlines.  An input word string of zero
00967  *  length is taken to be a paragraph separator.  Each time
00968  *  such a string is found, the current line is ended and 
00969  *  a new line is also produced that contains just the
00970  *  string of zero length ("").  When the output sarray
00971  *  of lines is eventually converted to a string with newlines
00972  *  (typically) appended to each line string, the empty
00973  *  strings are just converted to newlines, producing the visible
00974  *  paragraph separation.
00975  *
00976  *  What happens when a word is larger than linesize?
00977  *  We write it out as a single line anyway!  Words preceding
00978  *  or following this long word are placed on lines preceding
00979  *  or following the line with the long word.  Why this choice?
00980  *  Long "words" found in text documents are typically URLs, and
00981  *  it's often desirable not to put newlines in the middle of a URL.
00982  *  The text display program (e.g., text editor) will typically
00983  *  wrap the long "word" to fit in the window.
00984  */
00985 SARRAY *
00986 sarrayConvertWordsToLines(SARRAY  *sa,
00987                           l_int32  linesize)
00988 {
00989 char    *wd, *strl;
00990 char     emptystring[] = "";
00991 l_int32  n, i, len, totlen;
00992 SARRAY  *sal, *saout;
00993 
00994     PROCNAME("sarrayConvertWordsToLines");
00995 
00996     if (!sa)
00997         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
00998 
00999     if ((saout = sarrayCreate(0)) == NULL)
01000         return (SARRAY *)ERROR_PTR("saout not defined", procName, NULL);
01001 
01002     n = sarrayGetCount(sa);
01003     totlen = 0;
01004     sal = NULL;
01005     for (i = 0; i < n; i++) {
01006         if (!sal) {
01007             if ((sal = sarrayCreate(0)) == NULL)
01008                 return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
01009         }
01010         wd = sarrayGetString(sa, i, L_NOCOPY);
01011         len = strlen(wd);
01012         if (len == 0) {  /* end of paragraph: end line & insert blank line */
01013             if (totlen > 0) {
01014                 strl = sarrayToString(sal, 2);
01015                 sarrayAddString(saout, strl, L_INSERT);
01016             }
01017             sarrayAddString(saout, emptystring, L_COPY);
01018             sarrayDestroy(&sal);
01019             totlen = 0;
01020         }
01021         else if (totlen == 0 && len + 1 > linesize) {  /* long word! */
01022             sarrayAddString(saout, wd, L_COPY);  /* copy to one line */
01023         }
01024         else if (totlen + len + 1 > linesize) {  /* end line & start new one */
01025             strl = sarrayToString(sal, 2);
01026             sarrayAddString(saout, strl, L_INSERT);
01027             sarrayDestroy(&sal);
01028             if ((sal = sarrayCreate(0)) == NULL)
01029                 return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
01030             sarrayAddString(sal, wd, L_COPY);
01031             totlen = len + 1;
01032         }
01033         else {   /* add to current line */
01034             sarrayAddString(sal, wd, L_COPY);
01035             totlen += len + 1;
01036         }
01037     }
01038     if (totlen > 0) {   /* didn't end with blank line; output last line */
01039         strl = sarrayToString(sal, 2);
01040         sarrayAddString(saout, strl, L_INSERT);
01041         sarrayDestroy(&sal);
01042     }
01043 
01044     return saout;
01045 
01046 }
01047 
01048 
01049 /*----------------------------------------------------------------------*
01050  *                    Split string on separator list                    *
01051  *----------------------------------------------------------------------*/
01052 /*
01053  *  sarraySplitString()
01054  *
01055  *      Input:  sa (to append to; typically empty initially)
01056  *              str (string to split; not changed)
01057  *              separators (characters that split input string)
01058  *      Return: 0 if OK, 1 on error.
01059  *
01060  *  Notes:
01061  *      (1) This uses strtokSafe().  See the notes there in utils.c.
01062  */
01063 l_int32
01064 sarraySplitString(SARRAY      *sa,
01065                   const char  *str,
01066                   const char  *separators)
01067 {
01068 char  *cstr, *substr, *saveptr;
01069 
01070     PROCNAME("sarraySplitString");
01071 
01072     if (!sa)
01073         return ERROR_INT("sa not defined", procName, 1);
01074     if (!str)
01075         return ERROR_INT("str not defined", procName, 1);
01076     if (!separators)
01077         return ERROR_INT("separators not defined", procName, 1);
01078 
01079     cstr = stringNew(str);  /* preserves const-ness of input str */
01080     substr = strtokSafe(cstr, separators, &saveptr);
01081     if (substr)
01082         sarrayAddString(sa, substr, L_INSERT);
01083     while ((substr = strtokSafe(NULL, separators, &saveptr)))
01084         sarrayAddString(sa, substr, L_INSERT);
01085     FREE(cstr);
01086 
01087     return 0;
01088 }
01089 
01090 
01091 /*----------------------------------------------------------------------*
01092  *                              Filter sarray                           *
01093  *----------------------------------------------------------------------*/
01094 /*!
01095  *  sarraySelectBySubstring()
01096  *
01097  *      Input:  sain (input sarray)
01098  *              substr (<optional> substring for matching; can be NULL)
01099  *      Return: saout (output sarray, filtered with substring) or null on error
01100  *
01101  *  Notes:
01102  *      (1) This selects all strings in sain that have substr as a substring.
01103  *          Note that we can't use strncmp() because we're looking for
01104  *          a match to the substring anywhere within each filename.
01105  *      (2) If substr == NULL, returns a copy of the sarray.
01106  */
01107 SARRAY *
01108 sarraySelectBySubstring(SARRAY      *sain,
01109                         const char  *substr)
01110 {
01111 char    *str;
01112 l_int32  n, i, offset, found;
01113 SARRAY  *saout;
01114 
01115     PROCNAME("sarraySelectBySubstring");
01116 
01117     if (!sain)
01118         return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
01119 
01120     n = sarrayGetCount(sain);
01121     if (!substr || n == 0)
01122         return sarrayCopy(sain);
01123 
01124     saout = sarrayCreate(n);
01125     for (i = 0; i < n; i++) {
01126         str = sarrayGetString(sain, i, L_NOCOPY);
01127         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
01128                           strlen(substr), &offset, &found);
01129         if (found)
01130             sarrayAddString(saout, str, L_COPY);
01131     }
01132 
01133     return saout;
01134 }
01135 
01136 
01137 /*!
01138  *  sarraySelectByRange()
01139  *
01140  *      Input:  sain (input sarray)
01141  *              first (index of first string to be selected)
01142  *              last (index of last string to be selected; use 0 to go to the
01143  *                    end of the sarray)
01144  *      Return: saout (output sarray), or null on error
01145  *
01146  *  Notes:
01147  *      (1) This makes @saout consisting of copies of all strings in @sain
01148  *          in the index set [first ... last].  Use @last == 0 to get all
01149  *          strings from @first to the last string in the sarray.
01150  */
01151 SARRAY *
01152 sarraySelectByRange(SARRAY  *sain,
01153                     l_int32  first,
01154                     l_int32  last)
01155 {
01156 char    *str;
01157 l_int32  n, i;
01158 SARRAY  *saout;
01159 
01160     PROCNAME("sarraySelectByRange");
01161 
01162     if (!sain)
01163         return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
01164     if (first < 0) first = 0;
01165     n = sarrayGetCount(sain);
01166     if (last <= 0) last = n - 1;
01167     if (last >= n) {
01168         L_WARNING("@last > n - 1; setting to n - 1", procName);
01169         last = n - 1;
01170     }
01171     if (first > last)
01172         return (SARRAY *)ERROR_PTR("first must be >= last", procName, NULL);
01173 
01174     saout = sarrayCreate(0);
01175     for (i = first; i <= last; i++) {
01176         str = sarrayGetString(sain, i, L_COPY);
01177         sarrayAddString(saout, str, L_INSERT);
01178     }
01179 
01180     return saout;
01181 }
01182 
01183 
01184 /*!
01185  *  sarrayParseRange()
01186  *
01187  *      Input:  sa (input sarray)
01188  *              start (index to start range search)
01189  *             &actualstart (<return> index of actual start; may be > 'start')
01190  *             &end (<return> index of end)
01191  *             &newstart (<return> index of start of next range)
01192  *              substr (substring for matching at beginning of string)
01193  *              loc (byte offset within the string for the pattern; use
01194  *                   -1 if the location does not matter);
01195  *      Return: 0 if valid range found; 1 otherwise
01196  *
01197  *  Notes:
01198  *      (1) This finds the range of the next set of strings in SA,
01199  *          beginning the search at 'start', that does NOT have
01200  *          the substring 'substr' either at the indicated location
01201  *          in the string or anywhere in the string.  The input
01202  *          variable 'loc' is the specified offset within the string;
01203  *          use -1 to indicate 'anywhere in the string'.
01204  *      (2) Always check the return value to verify that a valid range
01205  *          was found.
01206  *      (3) If a valid range is not found, the values of actstart,
01207  *          end and newstart are all set to the size of sa.
01208  *      (4) If this is the last valid range, newstart returns the value n.
01209  *          In use, this should be tested before calling the function.
01210  *      (5) Usage example.  To find all the valid ranges in a file
01211  *          where the invalid lines begin with two dashes, copy each
01212  *          line in the file to a string in an sarray, and do:
01213  *             start = 0;
01214  *             while (!sarrayParseRange(sa, start, &actstart, &end, &start,
01215  *                    "--", 0))
01216  *                 fprintf(stderr, "start = %d, end = %d\n", actstart, end);
01217  */
01218 l_int32
01219 sarrayParseRange(SARRAY      *sa,
01220                  l_int32      start,
01221                  l_int32     *pactualstart,
01222                  l_int32     *pend,
01223                  l_int32     *pnewstart,
01224                  const char  *substr,
01225                  l_int32      loc)
01226 {
01227 char    *str;
01228 l_int32  n, i, offset, found;
01229 
01230     PROCNAME("sarrayParseRange");
01231 
01232     if (!sa)
01233         return ERROR_INT("sa not defined", procName, 1);
01234     if (!pactualstart || !pend || !pnewstart)
01235         return ERROR_INT("not all range addresses defined", procName, 1);
01236     n = sarrayGetCount(sa);
01237     *pactualstart = *pend = *pnewstart = n;
01238     if (!substr)
01239         return ERROR_INT("substr not defined", procName, 1);
01240 
01241         /* Look for the first string without the marker */
01242     if (start < 0 || start >= n)
01243         return 1;
01244     for (i = start; i < n; i++) {
01245         str = sarrayGetString(sa, i, L_NOCOPY);
01246         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
01247                           strlen(substr), &offset, &found);
01248         if (loc < 0) {
01249             if (!found) break;
01250         } else {
01251             if (!found || offset != loc) break;
01252         }
01253     }
01254     start = i;
01255     if (i == n)  /* couldn't get started */
01256         return 1;
01257 
01258         /* Look for the last string without the marker */
01259     *pactualstart = start;
01260     for (i = start + 1; i < n; i++) {
01261         str = sarrayGetString(sa, i, L_NOCOPY);
01262         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
01263                           strlen(substr), &offset, &found);
01264         if (loc < 0) {
01265             if (found) break;
01266         } else {
01267             if (found && offset == loc) break;
01268         }
01269     }
01270     *pend = i - 1;
01271     start = i;
01272     if (i == n)  /* no further range */
01273         return 0;
01274 
01275         /* Look for the first string after *pend without the marker.
01276          * This will start the next run of strings, if it exists. */
01277     for (i = start; i < n; i++) {
01278         str = sarrayGetString(sa, i, L_NOCOPY);
01279         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
01280                           strlen(substr), &offset, &found);
01281         if (loc < 0) {
01282             if (!found) break;
01283         } else {
01284             if (!found || offset != loc) break;
01285         }
01286     }
01287     if (i < n)
01288         *pnewstart = i;
01289 
01290     return 0;
01291 }
01292 
01293 
01294 /*----------------------------------------------------------------------*
01295  *                                   Sort                               *
01296  *----------------------------------------------------------------------*/
01297 /*!
01298  *  sarraySort()
01299  *
01300  *      Input:  saout (output sarray; can be NULL or equal to sain)
01301  *              sain (input sarray)
01302  *              sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
01303  *      Return: saout (output sarray, sorted by ascii value), or null on error
01304  *
01305  *  Notes:
01306  *      (1) Set saout = sain for in-place; otherwise, set naout = NULL.
01307  *      (2) Shell sort, modified from K&R, 2nd edition, p.62.
01308  *          Slow but simple O(n logn) sort.
01309  */
01310 SARRAY *
01311 sarraySort(SARRAY  *saout,
01312            SARRAY  *sain,
01313            l_int32  sortorder)
01314 {
01315 char   **array;
01316 char    *tmp;
01317 l_int32  n, i, j, gap;
01318 
01319     PROCNAME("sarraySort");
01320 
01321     if (!sain)
01322         return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
01323 
01324         /* Make saout if necessary; otherwise do in-place */
01325     if (!saout)
01326         saout = sarrayCopy(sain);
01327     else if (sain != saout)
01328         return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
01329     array = saout->array;  /* operate directly on the array */
01330     n = sarrayGetCount(saout);
01331 
01332         /* Shell sort */
01333     for (gap = n/2; gap > 0; gap = gap / 2) {
01334         for (i = gap; i < n; i++) {
01335             for (j = i - gap; j >= 0; j -= gap) {
01336                 if ((sortorder == L_SORT_INCREASING &&
01337                      stringCompareLexical(array[j], array[j + gap])) ||
01338                     (sortorder == L_SORT_DECREASING &&
01339                      stringCompareLexical(array[j + gap], array[j])))
01340                 {
01341                     tmp = array[j];
01342                     array[j] = array[j + gap];
01343                     array[j + gap] = tmp;
01344                 }
01345             }
01346         }
01347     }
01348 
01349     return saout;
01350 }
01351 
01352 
01353 /*!
01354  *  stringCompareLexical()
01355  *
01356  *      Input:  str1
01357  *              str2
01358  *      Return: 1 if str1 > str2 (lexically); 0 otherwise
01359  *
01360  *  Notes:
01361  *      (1) If the lexical values are identical, return a 0, to
01362  *          indicate that no swapping is required to sort the strings.
01363  */
01364 l_int32
01365 stringCompareLexical(const char *str1,
01366                      const char *str2)
01367 {
01368 l_int32  i, len1, len2, len;
01369 
01370     PROCNAME("sarrayCompareLexical");
01371 
01372     if (!str1)
01373         return ERROR_INT("str1 not defined", procName, 1);
01374     if (!str2)
01375         return ERROR_INT("str2 not defined", procName, 1);
01376 
01377     len1 = strlen(str1);
01378     len2 = strlen(str2);
01379     len = L_MIN(len1, len2);
01380 
01381     for (i = 0; i < len; i++) {
01382         if (str1[i] == str2[i])
01383             continue;
01384         if (str1[i] > str2[i])
01385             return 1;
01386         else
01387             return 0;
01388     }
01389 
01390     if (len1 > len2)
01391         return 1;
01392     else
01393         return 0;
01394 }
01395 
01396 
01397 /*----------------------------------------------------------------------*
01398  *                           Serialize for I/O                          *
01399  *----------------------------------------------------------------------*/
01400 /*!
01401  *  sarrayRead()
01402  *
01403  *      Input:  filename
01404  *      Return: sarray, or null on error
01405  */
01406 SARRAY *
01407 sarrayRead(const char  *filename)
01408 {
01409 FILE    *fp;
01410 SARRAY  *sa;
01411 
01412     PROCNAME("sarrayRead");
01413 
01414     if (!filename)
01415         return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);
01416 
01417     if ((fp = fopenReadStream(filename)) == NULL)
01418         return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
01419 
01420     if ((sa = sarrayReadStream(fp)) == NULL) {
01421         fclose(fp);
01422         return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
01423     }
01424 
01425     fclose(fp);
01426     return sa;
01427 }
01428 
01429 
01430 /*!
01431  *  sarrayReadStream()
01432  *
01433  *      Input:  stream
01434  *      Return: sarray, or null on error
01435  *
01436  *  Notes:
01437  *      (1) We store the size of each string along with the string.
01438  *      (2) This allows a string to have embedded newlines.  By reading
01439  *          the entire string, as determined by its size, we are
01440  *          not affected by any number of embedded newlines.
01441  */
01442 SARRAY *
01443 sarrayReadStream(FILE  *fp)
01444 {
01445 char    *stringbuf;
01446 l_int32  i, n, size, index, bufsize, version, ignore;
01447 SARRAY  *sa;
01448 
01449     PROCNAME("sarrayReadStream");
01450 
01451     if (!fp)
01452         return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);
01453 
01454     if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
01455         return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
01456     if (version != SARRAY_VERSION_NUMBER)
01457         return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
01458     if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
01459         return (SARRAY *)ERROR_PTR("error on # strings", procName, NULL);
01460 
01461     if ((sa = sarrayCreate(n)) == NULL)
01462         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
01463     bufsize = L_BUF_SIZE + 1;
01464     if ((stringbuf = (char *)CALLOC(bufsize, sizeof(char))) == NULL)
01465         return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL);
01466 
01467     for (i = 0; i < n; i++) {
01468             /* Get the size of the stored string */
01469         if (fscanf(fp, "%d[%d]:", &index, &size) != 2)
01470             return (SARRAY *)ERROR_PTR("error on string size", procName, NULL);
01471             /* Expand the string buffer if necessary */
01472         if (size > bufsize - 5) {
01473             FREE(stringbuf);
01474             bufsize = (l_int32)(1.5 * size);
01475             stringbuf = (char *)CALLOC(bufsize, sizeof(char));
01476         }
01477             /* Read the stored string, plus leading spaces and trailing \n */
01478         if (fread(stringbuf, 1, size + 3, fp) != size + 3)
01479             return (SARRAY *)ERROR_PTR("error reading string", procName, NULL);
01480             /* Remove the \n that was added by sarrayWriteStream() */
01481         stringbuf[size + 2] = '\0';
01482             /* Copy it in, skipping the 2 leading spaces */
01483         sarrayAddString(sa, stringbuf + 2, L_COPY);
01484     }
01485     ignore = fscanf(fp, "\n");
01486 
01487     FREE(stringbuf);
01488     return sa;
01489 }
01490 
01491 
01492 /*!
01493  *  sarrayWrite()
01494  *
01495  *      Input:  filename
01496  *              sarray
01497  *      Return: 0 if OK; 1 on error
01498  */
01499 l_int32
01500 sarrayWrite(const char  *filename,
01501             SARRAY      *sa)
01502 {
01503 FILE  *fp;
01504 
01505     PROCNAME("sarrayWrite");
01506 
01507     if (!filename)
01508         return ERROR_INT("filename not defined", procName, 1);
01509     if (!sa)
01510         return ERROR_INT("sa not defined", procName, 1);
01511 
01512     if ((fp = fopenWriteStream(filename, "w")) == NULL)
01513         return ERROR_INT("stream not opened", procName, 1);
01514 
01515     if (sarrayWriteStream(fp, sa))
01516         return ERROR_INT("sa not written to stream", procName, 1);
01517     
01518     fclose(fp);
01519     return 0;
01520 }
01521 
01522 
01523 /*!
01524  *  sarrayWriteStream()
01525  *
01526  *      Input:  stream
01527  *              sarray
01528  *      Returns 0 if OK; 1 on error
01529  *
01530  *  Notes:
01531  *      (1) This appends a '\n' to each string, which is stripped
01532  *          off by sarrayReadStream().
01533  */
01534 l_int32
01535 sarrayWriteStream(FILE    *fp,
01536                   SARRAY  *sa)
01537 {
01538 l_int32  i, n, len;
01539 
01540     PROCNAME("sarrayWriteStream");
01541 
01542     if (!fp)
01543         return ERROR_INT("stream not defined", procName, 1);
01544     if (!sa)
01545         return ERROR_INT("sa not defined", procName, 1);
01546 
01547     n = sarrayGetCount(sa);
01548     fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
01549     fprintf(fp, "Number of strings = %d\n", n);
01550     for (i = 0; i < n; i++) {
01551         len = strlen(sa->array[i]);
01552         fprintf(fp, "  %d[%d]:  %s\n", i, len, sa->array[i]);
01553     }
01554     fprintf(fp, "\n");
01555 
01556     return 0;
01557 }
01558 
01559 
01560 /*!
01561  *  sarrayAppend()
01562  *
01563  *      Input:  filename
01564  *              sarray
01565  *      Return: 0 if OK; 1 on error
01566  */
01567 l_int32
01568 sarrayAppend(const char  *filename,
01569              SARRAY      *sa)
01570 {
01571 FILE  *fp;
01572 
01573     PROCNAME("sarrayAppend");
01574 
01575     if (!filename)
01576         return ERROR_INT("filename not defined", procName, 1);
01577     if (!sa)
01578         return ERROR_INT("sa not defined", procName, 1);
01579 
01580     if ((fp = fopenWriteStream(filename, "a")) == NULL)
01581         return ERROR_INT("stream not opened", procName, 1);
01582 
01583     if (sarrayWriteStream(fp, sa))
01584         return ERROR_INT("sa not appended to stream", procName, 1);
01585     
01586     fclose(fp);
01587     return 0;
01588 }
01589 
01590 
01591 /*---------------------------------------------------------------------*
01592  *                           Directory filenames                       *
01593  *---------------------------------------------------------------------*/
01594 /*!
01595  *  getNumberedPathnamesInDirectory()
01596  *
01597  *      Input:  directory name
01598  *              substr (<optional> substring filter on filenames; can be NULL)
01599  *              numpre (number of characters in name before number)
01600  *              numpost (number of characters in name after number, up
01601  *                       to a dot before an extension)
01602  *                       including an extension and the dot separator)
01603  *              maxnum (only consider page numbers up to this value)
01604  *      Return: sarray of sorted pathnames, or NULL on error
01605  *
01606  *  Notes:
01607  *      (1) Returns the full pathnames of the numbered filenames in
01608  *          the directory.  The number in the filename is the index
01609  *          into the sarray.  For indices for which there are no filenames,
01610  *          an empty string ("") is placed into the sarray.
01611  *          This makes reading numbered files very simple.  For example,
01612  *          the image whose filename includes number N can be retrieved using
01613  *               pixReadIndexed(sa, N);
01614  *      (2) If @substr is not NULL, only filenames that contain
01615  *          the substring can be included.  If @substr is NULL,
01616  *          all matching filenames are used.
01617  *      (3) If no numbered files are found, it returns an empty sarray,
01618  *          with no initialized strings.
01619  *      (4) It is assumed that the page number is contained within
01620  *          the basename (the filename without directory or extension).
01621  *          @numpre is the number of characters in the basename
01622  *          preceeding the actual page number; @numpost is the number
01623  *          following the page number, up to either the end of the
01624  *          basename or a ".", whichever comes first.
01625  *      (5) To use a O(n) matching algorithm, the largest page number
01626  *          is found and two internal arrays of this size are created.
01627  *          This maximum is constrained not to exceed @maxsum,
01628  *          to make sure that an unrealistically large number is not
01629  *          accidentally used to determine the array sizes.
01630  */
01631 SARRAY *
01632 getNumberedPathnamesInDirectory(const char  *dirname,
01633                                 const char  *substr,
01634                                 l_int32      numpre,
01635                                 l_int32      numpost,
01636                                 l_int32      maxnum)
01637 {
01638 char    *fname, *str;
01639 l_int32  i, nfiles, num, index;
01640 SARRAY  *sa, *saout;
01641 
01642     PROCNAME("getNumberedPathnamesInDirectory");
01643 
01644     if (!dirname)
01645         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
01646 
01647     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
01648         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
01649     if ((nfiles = sarrayGetCount(sa)) == 0)
01650         return sarrayCreate(1);
01651 
01652         /* Find the last file in the sorted array that has a number
01653          * that (a) matches the count pattern and (b) does not
01654          * exceed @maxnum.  @maxnum sets an upper limit on the size
01655          * of the sarray.  */
01656     num = 0;
01657     for (i = nfiles - 1; i >= 0; i--) {
01658       fname = sarrayGetString(sa, i, L_NOCOPY);
01659       num = extractNumberFromFilename(fname, numpre, numpost);
01660       if (num < 0) continue;
01661       num = L_MIN(num + 1, maxnum);
01662       break;
01663     }
01664 
01665     if (num <= 0)  /* none found */
01666         return sarrayCreate(1);
01667 
01668         /* Insert pathnames into the output sarray.
01669          * Ignore numbers that are out of the range of sarray. */
01670     saout = sarrayCreateInitialized(num, (char *)"");
01671     for (i = 0; i < nfiles; i++) {
01672       fname = sarrayGetString(sa, i, L_NOCOPY);
01673       index = extractNumberFromFilename(fname, numpre, numpost);
01674       if (index < 0 || index >= num) continue;
01675       str = sarrayGetString(saout, index, L_NOCOPY);
01676       if (str[0] != '\0')
01677           L_WARNING_INT("\n  Multiple files with same number: %d",
01678                         procName, index);
01679       sarrayReplaceString(saout, index, fname, L_COPY);
01680     }
01681 
01682     sarrayDestroy(&sa);
01683     return saout;
01684 }
01685 
01686 
01687 /*!
01688  *  getSortedPathnamesInDirectory()
01689  *
01690  *      Input:  directory name
01691  *              substr (<optional> substring filter on filenames; can be NULL)
01692  *              firstpage (0-based)
01693  *              npages (use 0 for all to the end)
01694  *      Return: sarray of sorted pathnames, or NULL on error
01695  *
01696  *  Notes:
01697  *      (1) If @substr is not NULL, only filenames that contain
01698  *          the substring can be returned.  If @substr == NULL,
01699  *          none of the filenames are filtered out.
01700  *      (2) The files in the directory, after optional filtering by
01701  *          the substring, are lexically sorted in increasing order.
01702  *          The full pathnames are returned for the requested sequence.
01703  *          If no files are found after filtering, returns an empty sarray.
01704  */
01705 SARRAY *
01706 getSortedPathnamesInDirectory(const char  *dirname,
01707                               const char  *substr,
01708                               l_int32      firstpage,
01709                               l_int32      npages)
01710 {
01711 char    *fname, *fullname;
01712 l_int32  i, nfiles, lastpage;
01713 SARRAY  *sa, *safiles, *saout;
01714 
01715     PROCNAME("getSortedPathnamesInDirectory");
01716 
01717     if (!dirname)
01718         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
01719 
01720     if ((sa = getFilenamesInDirectory(dirname)) == NULL)
01721         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
01722     safiles = sarraySelectBySubstring(sa, substr);
01723     sarrayDestroy(&sa);
01724     nfiles = sarrayGetCount(safiles);
01725     if (nfiles == 0) {
01726         L_WARNING("no files found", procName);
01727         return safiles;
01728     }
01729 
01730     sarraySort(safiles, safiles, L_SORT_INCREASING);
01731 
01732     firstpage = L_MIN(L_MAX(firstpage, 0), nfiles - 1);
01733     if (npages == 0)
01734         npages = nfiles - firstpage;
01735     lastpage = L_MIN(firstpage + npages - 1, nfiles - 1);
01736 
01737     saout = sarrayCreate(lastpage - firstpage + 1);
01738     for (i = firstpage; i <= lastpage; i++) {
01739         fname = sarrayGetString(safiles, i, L_NOCOPY);
01740         fullname = genPathname(dirname, fname);
01741         sarrayAddString(saout, fullname, L_INSERT);
01742     }
01743 
01744     sarrayDestroy(&safiles);
01745     return saout;
01746 }
01747 
01748 
01749 /*!
01750  *  getFilenamesInDirectory()
01751  *
01752  *      Input:  directory name
01753  *      Return: sarray of file names, or NULL on error
01754  *
01755  *  Notes:
01756  *      (1) The versions compiled under unix and cygwin use the POSIX C
01757  *          library commands for handling directories.  For windows,
01758  *          there is a separate implementation.
01759  *      (2) It returns an array of filename tails; i.e., only the part of
01760  *          the path after the last slash.
01761  *      (3) Use of the d_type field of dirent is not portable:
01762  *          "According to POSIX, the dirent structure contains a field
01763  *          char d_name[] of unspecified size, with at most NAME_MAX
01764  *          characters preceding the terminating null character.  Use
01765  *          of other fields will harm the portability of your programs."
01766  *      (4) As a consequence of (3), we note several things:
01767  *           - MINGW doesn't have a d_type member.
01768  *           - Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
01769  *             for d_type from all files.
01770  *          On these systems, this function will return directories
01771  *          (except for '.' and '..', which are eliminated using
01772  *          the d_name field).
01773  */
01774 
01775 #ifndef _WIN32
01776 
01777 SARRAY *
01778 getFilenamesInDirectory(const char  *dirname)
01779 {
01780 char           *name;
01781 l_int32         len;
01782 SARRAY         *safiles;
01783 DIR            *pdir;
01784 struct dirent  *pdirentry;
01785 
01786     PROCNAME("getFilenamesInDirectory");
01787 
01788     if (!dirname)
01789         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
01790 
01791     if ((pdir = opendir(dirname)) == NULL)
01792         return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
01793     if ((safiles = sarrayCreate(0)) == NULL)
01794         return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
01795     while ((pdirentry = readdir(pdir)))  {
01796 
01797         /* It's nice to ignore directories.  For this it is necessary to
01798          * define _BSD_SOURCE in the CC command, because the DT_DIR
01799          * flag is non-standard.  */ 
01800 #if !defined(__SOLARIS__)
01801         if (pdirentry->d_type == DT_DIR)
01802             continue;
01803 #endif
01804 
01805             /* Filter out "." and ".." if they're passed through */
01806         name = pdirentry->d_name;
01807         len = strlen(name);
01808         if (len == 1 && name[len - 1] == '.') continue;
01809         if (len == 2 && name[len - 1] == '.' && name[len - 2] == '.') continue;
01810         sarrayAddString(safiles, name, L_COPY);
01811     }
01812     closedir(pdir);
01813 
01814     return safiles;
01815 }
01816 
01817 #else  /* _WIN32 */
01818 
01819     /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
01820 #include <windows.h>
01821 
01822 SARRAY *
01823 getFilenamesInDirectory(const char  *dirname)
01824 {
01825 char             *pszDir;
01826 char             *tempname;
01827 HANDLE            hFind = INVALID_HANDLE_VALUE;
01828 SARRAY           *safiles;
01829 WIN32_FIND_DATAA  ffd;
01830 
01831     PROCNAME("getFilenamesInDirectory");
01832 
01833     if (!dirname)
01834         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
01835 
01836     tempname = genPathname(dirname, NULL);
01837     pszDir = stringJoin(tempname, "\\*");
01838     FREE(tempname);
01839 
01840     if (strlen(pszDir) + 1 > MAX_PATH) {
01841         FREE(pszDir);
01842         return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL);
01843     }
01844 
01845     if ((safiles = sarrayCreate(0)) == NULL) {
01846         FREE(pszDir);
01847         return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
01848     }
01849 
01850     hFind = FindFirstFileA(pszDir, &ffd);
01851     if (INVALID_HANDLE_VALUE == hFind) {
01852         sarrayDestroy(&safiles);
01853         FREE(pszDir);
01854         return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
01855     }
01856 
01857     while (FindNextFileA(hFind, &ffd) != 0) {
01858         if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)  /* skip dirs */
01859             continue;
01860         sarrayAddString(safiles, ffd.cFileName, L_COPY);
01861     }
01862 
01863     FindClose(hFind);
01864     FREE(pszDir);
01865     return safiles;
01866 }
01867 
01868 #endif  /* _WIN32 */
01869 
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines