Leptonica 1.68
C Image Processing Library

pdfio.c

Go to the documentation of this file.
00001 /*====================================================================*
00002  -  Copyright (C) 2001 Leptonica.  All rights reserved.
00003  -  This software is distributed in the hope that it will be
00004  -  useful, but with NO WARRANTY OF ANY KIND.
00005  -  No author or distributor accepts responsibility to anyone for the
00006  -  consequences of using this software, or for whether it serves any
00007  -  particular purpose or works at all, unless he or she says so in
00008  -  writing.  Everyone is granted permission to copy, modify and
00009  -  redistribute this source code, for commercial or non-commercial
00010  -  purposes, with the following restrictions: (1) the origin of this
00011  -  source code must not be misrepresented; (2) modified versions must
00012  -  be plainly marked as such; and (3) this notice may not be removed
00013  -  or altered from any source or modified source distribution.
00014  *====================================================================*/
00015 
00016 /*
00017  *  pdfio.c
00018  *
00019  *    |=============================================================|
00020  *    |                         Important note                      |
00021  *    |=============================================================|
00022  *    | Some of these functions require libtiff, libjpeg, and libz  |
00023  *    | If you do not have these libraries, you must set            |
00024  *    |      #define  USE_PDFIO     0                               |
00025  *    | in environ.h.  This will link pdfiostub.c                   |
00026  *    |=============================================================|
00027  *
00028  *     The first set of functions converts a set of images to a multi-page
00029  *     pdf file, with one image on each page.  All images are rendered
00030  *     at the same (input) resolution.  The images can be specified as
00031  *     being in a directory, or they can be in an sarray.  The output
00032  *     pdf can be either a file or an array of bytes in memory.
00033  *
00034  *     The second set of functions implements a pdf output "device driver"
00035  *     for wrapping (encoding) any number of images on a single page
00036  *     in pdf.  The images can be rendered using a pdf viewer,
00037  *     such as gv, evince, xpdf or acroread.
00038  *     See: http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
00039  *
00040  *     The third set of functions (segmented) takes an image, an
00041  *     optional binary mask, an encoding flag, and some other parameters,
00042  *     and generates a single-page mixed raster pdf.
00043  *
00044  *     The fourth set of functions (concatenated) takes a set of single-page
00045  *     pdf files and concatenates them into a multi-page pdf.
00046  *
00047  *     1. Convert specified image files to Pdf (one image file per page)
00048  *          l_int32             convertFilesToPdf()
00049  *          l_int32             saConvertFilesToPdf()
00050  *          l_int32             saConvertFilesToPdfData()
00051  *          l_int32             selectDefaultPdfEncoding()
00052  *
00053  *     2. Single page, multi-image converters
00054  *          l_int32             convertToPdf()
00055  *          l_int32             convertImageDataToPdf()
00056  *          l_int32             convertToPdfData()
00057  *          l_int32             convertImageDataToPdfData()
00058  *          l_int32             pixConvertToPdf()
00059  *          l_int32             pixConvertToPdfData()
00060  *          l_int32             pixWriteStreamPdf()
00061  *
00062  *     3. Segmented multi-page, multi-image converter
00063  *          l_int32             convertSegmentedFilesToPdf()
00064  *
00065  *     4. Segmented single page, multi-image converters
00066  *          l_int32             convertToPdfSegmented()
00067  *          l_int32             pixConvertToPdfSegmented()
00068  *          l_int32             convertToPdfDataSegmented()
00069  *          l_int32             pixConvertToPdfDataSegmented()
00070  *
00071  *     Helper functions for generating the output pdf string
00072  *          static l_int32      l_generatePdf()
00073  *          static void         generateFixedStringsPdf()
00074  *          static void         generateMediaboxPdf()
00075  *          static l_int32      generatePageStringPdf()
00076  *          static l_int32      generateContentStringPdf()
00077  *          static l_int32      generatePreXStringsPdf()
00078  *          static l_int32      generateColormapStringsPdf()
00079  *          static void         generateTrailerPdf()
00080  *          static char        *makeTrailerStringPdf()
00081  *          static l_int32      generateOutputDataPdf()
00082  *
00083  *     5. Multi-page concatenation
00084  *          l_int32             concatenatePdf()
00085  *          l_int32             saConcatenatePdf()
00086  *          l_int32             ptraConcatenatePdf()
00087  *          l_int32             concatenatePdfToData()
00088  *          l_int32             saConcatenatePdfToData()
00089  *          l_int32             ptraConcatenatePdfToData()
00090  *
00091  *     Helper functions for generating the multi-page pdf output
00092  *          static l_int32      parseTrailerPdf()
00093  *          static char        *generatePagesObjStringPdf()
00094  *          static L_BYTEA     *substituteObjectNumbers()
00095  *
00096  *     Create/destroy/access pdf data
00097  *          static L_PDF_DATA         *pdfdataCreate()
00098  *          static void                pdfdataDestroy()
00099  *          static L_COMPRESSED_DATA  *pdfdataGetCid()
00100  *
00101  *     Set flags for special modes
00102  *          void                l_pdfSetG4ImageMask()
00103  *          void                l_pdfSetDateAndVersion()
00104  *
00105  *     The top-level multi-image functions can be visualized as follows:
00106  *          Output pdf data to file:
00107  *             convertToPdf()  and  convertImageDataToPdf()
00108  *                     --> pixConvertToPdf()
00109  *                           --> pixConvertToPdfData()
00110  *
00111  *          Output pdf data to array in memory:
00112  *             convertToPdfData()  and  convertImageDataToPdfData()
00113  *                     --> pixConvertToPdfData()
00114  *
00115  *     The top-level segmented image functions can be visualized as follows:
00116  *          Output pdf data to file:
00117  *             convertToPdfSegmented()
00118  *                     --> pixConvertToPdfSegmented()
00119  *                           --> pixConvertToPdfDataSegmented()
00120  *
00121  *          Output pdf data to array in memory:
00122  *             convertToPdfDataSegmented()
00123  *                     --> pixConvertToPdfDataSegmented()
00124  *
00125  *     For multi-page concatenation, there are three different types of input
00126  *        (1) directory and optional filename filter
00127  *        (2) sarray of filenames
00128  *        (3) ptra of byte arrays of pdf data
00129  *     and two types of output for the concatenated pdf data
00130  *        (1) filename
00131  *        (2) data array and size
00132  *     High-level interfaces are given for each of the six combinations.
00133  */
00134 
00135 #include <string.h>
00136 #include <math.h>
00137 #include "allheaders.h"
00138 
00139 /* --------------------------------------------*/
00140 #if  USE_PDFIO   /* defined in environ.h */
00141  /* --------------------------------------------*/
00142  
00143     /* Typical scan resolution in ppi (pixels/inch); per the function notes in this file, used when @res == 0 and the input carries no resolution */
00144 static const l_int32  DEFAULT_INPUT_RES = 300;
00145 
00146     /* Static helpers for generating the output pdf string (single page) */
00147 static l_int32   l_generatePdf(l_uint8 **pdata, size_t *pnbytes,
00148                                L_PDF_DATA *lpd);
00149 static void      generateFixedStringsPdf(L_PDF_DATA *lpd);
00150 static void      generateMediaboxPdf(L_PDF_DATA *lpd);
00151 static l_int32   generatePageStringPdf(L_PDF_DATA *lpd);
00152 static l_int32   generateContentStringPdf(L_PDF_DATA *lpd);
00153 static l_int32   generatePreXStringsPdf(L_PDF_DATA *lpd);
00154 static l_int32   generateColormapStringsPdf(L_PDF_DATA *lpd);
00155 static void      generateTrailerPdf(L_PDF_DATA *lpd);
00156 static char     *makeTrailerStringPdf(NUMA *naloc);
00157 static l_int32   generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes,
00158                                        L_PDF_DATA *lpd);
00159     /* Static helpers for generating the multi-page (concatenated) pdf output */
00160 static l_int32   parseTrailerPdf(L_BYTEA *bas, NUMA **pna);
00161 static char     *generatePagesObjStringPdf(NUMA *napage);
00162 static L_BYTEA  *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs);
00163     /* Static helpers to create, destroy and access the pdf data struct */
00164 static L_PDF_DATA         *pdfdataCreate(const char *title);
00165 static void                pdfdataDestroy(L_PDF_DATA **plpd);
00166 static L_COMPRESSED_DATA  *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index);
00167 
00168 
00169 /* ---------------- Defaults for rendering options ----------------- */
00170     /* Output G4 as writing through image mask; this is the default (presumably changed via l_pdfSetG4ImageMask(), defined outside this view) */
00171 static l_int32   var_WRITE_G4_IMAGE_MASK = 1;
00172     /* Write date/time and lib version into pdf; this is the default (presumably changed via l_pdfSetDateAndVersion(), defined outside this view) */
00173 static l_int32   var_WRITE_DATE_AND_VERSION = 1;
00174 
00175 #define L_SMALLBUF   256   /* NOTE(review): presumably the size of short formatting buffers; uses are outside this view */
00176 #define L_BIGBUF    2048   /* must be able to hold hex colormap */
00177 
00178     /* DEBUG_MULTIPAGE is only defined when console I/O is available; presumably gates debug output in the multi-page code -- TODO confirm */
00179 #ifndef  NO_CONSOLE_IO
00180 #define  DEBUG_MULTIPAGE      0
00181 #endif  /* ~NO_CONSOLE_IO */
00182 
00183 
00184 /*---------------------------------------------------------------------*
00185  *    Convert specified image files to Pdf (one image file per page)   *
00186  *---------------------------------------------------------------------*/
00187 /*!
00188  *  convertFilesToPdf()
00189  *
00190  *      Input:  directory name (containing images)
00191  *              substr (<optional> substring filter on filenames; can be NULL)
00192  *              res (input resolution of all images)
00193  *              scalefactor (scaling factor applied to each image)
00194  *              quality (used for JPEG only; 0 for default (75))
00195  *              title (<optional> pdf title; if null, taken from the first
00196  *                     image filename)
00197  *              fileout (pdf file of all images)
00198  *      Return: 0 if OK, 1 on error
00199  *
00200  *  Notes:
00201  *      (1) If @substr is not NULL, only image filenames that contain
00202  *          the substring can be used.  If @substr == NULL, all files
00203  *          in the directory are used.
00204  *      (2) The files in the directory, after optional filtering by
00205  *          the substring, are lexically sorted in increasing order
00206  *          before concatenation.
00207  *      (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
00208  *          colormap and many colors, or 32 bpp; FLATE for anything else.
00209  */
00210 l_int32
00211 convertFilesToPdf(const char  *dirname,
00212                   const char  *substr,
00213                   l_int32      res,
00214                   l_float32    scalefactor,
00215                   l_int32      quality,
00216                   const char  *title,
00217                   const char  *fileout)
00218 {
00219 l_int32  ret;
00220 SARRAY  *sa;
00221 
00222     PROCNAME("convertFilesToPdf");
00223 
00224     if (!dirname)
00225         return ERROR_INT("dirname not defined", procName, 1);
00226     if (!fileout)
00227         return ERROR_INT("fileout not defined", procName, 1);
00228 
00229     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
00230         return ERROR_INT("sa not made", procName, 1);
00231     ret = saConvertFilesToPdf(sa, res, scalefactor, quality, title, fileout);
00232     sarrayDestroy(&sa);
00233     return ret;
00234 }
00235 
00236 
00237 /*!
00238  *  saConvertFilesToPdf()
00239  *
00240  *      Input:  sarray (of pathnames for images)
00241  *              res (input resolution of all images)
00242  *              scalefactor (scaling factor applied to each image)
00243  *              quality (used for JPEG only; 0 for default (75))
00244  *              title (<optional> pdf title; if null, taken from the first
00245  *                     image filename)
00246  *              fileout (pdf file of all images)
00247  *      Return: 0 if OK, 1 on error
00248  *
00249  *  Notes:
00250  *      (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
00251  *          colormap and many colors, or 32 bpp; FLATE for anything else.
00252  */
00253 l_int32
00254 saConvertFilesToPdf(SARRAY      *sa,
00255                     l_int32      res,
00256                     l_float32    scalefactor,
00257                     l_int32      quality,
00258                     const char  *title,
00259                     const char  *fileout)
00260 {
00261 l_uint8  *data;
00262 l_int32   ret;
00263 size_t    nbytes;
00264 
00265     PROCNAME("saConvertFilesToPdf");
00266 
00267     if (!sa)
00268         return ERROR_INT("sa not defined", procName, 1);
00269 
00270     ret = saConvertFilesToPdfData(sa, res, scalefactor, quality, title,
00271                                   &data, &nbytes);
00272     if (ret) {
00273         if (data) FREE(data);
00274         return ERROR_INT("pdf data not made", procName, 1);
00275     }
00276 
00277     ret = l_binaryWrite(fileout, "w", data, nbytes);
00278     FREE(data);
00279     if (ret)
00280         L_ERROR("pdf data not written to file", procName);
00281     return ret;
00282 }
00283 
00284 
00285 /*!
00286  *  saConvertFilesToPdfData()
00287  *
00288  *      Input:  sarray (of pathnames for images)
00289  *              res (input resolution of all images)
00290  *              scalefactor (scaling factor applied to each image)
00291  *              quality (used for JPEG only; 0 for default (75))
00292  *              title (<optional> pdf title; if null, taken from the first
00293  *                     image filename)
00294  *              &data (<return> output pdf data (of all images))
00295  *              &nbytes (<return> size of output pdf data)
00296  *      Return: 0 if OK, 1 on error
00297  *
00298  *  Notes:
00299  *      (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
00300  *          colormap and many colors, or 32 bpp; FLATE for anything else.
00301  */
00302 l_int32
00303 saConvertFilesToPdfData(SARRAY      *sa,
00304                         l_int32      res,
00305                         l_float32    scalefactor,
00306                         l_int32      quality,
00307                         const char  *title,
00308                         l_uint8    **pdata,
00309                         size_t      *pnbytes)
00310 {
00311 char     *fname;
00312 l_uint8  *imdata;
00313 l_int32   i, n, ret, type, npages, scaledres;
00314 size_t    imbytes;
00315 L_BYTEA  *ba;
00316 PIX      *pixs, *pix;
00317 L_PTRA   *pa_data;
00318 
00319     PROCNAME("saConvertFilesToPdfData");
00320 
00321     if (!sa)
00322         return ERROR_INT("sa not defined", procName, 1);
00323     if (scalefactor <= 0.0) scalefactor = 1.0;
00324 
00325         /* Generate all the encoded pdf strings */
00326     n = sarrayGetCount(sa);
00327     pa_data = ptraCreate(n);
00328     for (i = 0; i < n; i++) {
00329         fname = sarrayGetString(sa, i, L_NOCOPY);
00330         if ((pixs = pixRead(fname)) == NULL) {
00331             L_ERROR_STRING("image not readable from file %s", procName, fname);
00332             continue;
00333         }
00334         if (scalefactor != 1.0)
00335             pix = pixScale(pixs, scalefactor, scalefactor);
00336         else
00337             pix = pixClone(pixs);
00338         scaledres = (l_int32)(res * scalefactor);
00339         if (selectDefaultPdfEncoding(pix, &type)) {
00340             L_ERROR_STRING("encoding type selection failed for file %s",
00341                            procName, fname);
00342             pixDestroy(&pix);
00343             continue;
00344         }
00345         ret = pixConvertToPdfData(pix, type, quality, &imdata, &imbytes,
00346                                   0, 0, scaledres, NULL, 0, title);
00347         pixDestroy(&pix);
00348         pixDestroy(&pixs);
00349         if (ret) {
00350             L_ERROR_STRING("pdf encoding failed for %s", procName, fname);
00351             continue;
00352         }
00353         ba = l_byteaInitFromMem(imdata, imbytes);
00354         if (imdata) FREE(imdata);
00355         ptraAdd(pa_data, ba);
00356     }
00357     ptraGetActualCount(pa_data, &npages);
00358     if (npages == 0) {
00359         L_ERROR("no pdf files made", procName);
00360         ptraDestroy(&pa_data, FALSE, FALSE);
00361         return 1;
00362     }
00363 
00364         /* Concatenate them */
00365     ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
00366 
00367     ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
00368     for (i = 0; i < npages; i++) {
00369         ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
00370         l_byteaDestroy(&ba);
00371     }
00372     ptraDestroy(&pa_data, FALSE, FALSE);
00373     return ret;
00374 }
00375 
00376 
00377 /*!
00378  *  selectDefaultPdfEncoding()
00379  *
00380  *      Input:  pix
00381  *              &type (<return> L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
00382  *
00383  *  Notes:
00384  *      (1) This attempts to choose an encoding for the pix that results
00385  *          in the smallest file, assuming that if jpeg encoded, it will
00386  *          use quality = 75.  The decision is approximate, in that
00387  *          (a) all colormapped images will be losslessly encoded with
00388  *          gzip (flate), and (b) an image with less than about 20 colors
00389  *          is likely to be smaller if flate encoded than if encoded
00390  *          as a jpeg (dct).  For example, an image made by pixScaleToGray3()
00391  *          will have 10 colors, and flate encoding will give about
00392  *          twice the compression as jpeg with quality = 75.
00393  */
00394 l_int32
00395 selectDefaultPdfEncoding(PIX      *pix,
00396                          l_int32  *ptype)
00397 {
00398 l_int32   w, h, d, factor, ncolors;
00399 PIXCMAP  *cmap;
00400 
00401     PROCNAME("selectDefaultPdfEncoding");
00402 
00403     if (!pix)
00404         return ERROR_INT("pix not defined", procName, 1);
00405     if (!ptype)
00406         return ERROR_INT("&type not defined", procName, 1);
00407     *ptype = L_FLATE_ENCODE;  /* default universal encoding */
00408     pixGetDimensions(pix, &w, &h, &d);
00409     cmap = pixGetColormap(pix);
00410     if (d == 8 && !cmap) {
00411         factor = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 20000.));
00412         pixNumColors(pix, factor, &ncolors);
00413         if (ncolors < 20)
00414             *ptype = L_FLATE_ENCODE;
00415         else
00416             *ptype = L_JPEG_ENCODE;
00417     }
00418     else if (d == 1)
00419         *ptype = L_G4_ENCODE;
00420     else if (cmap || d == 2 || d == 4)
00421         *ptype = L_FLATE_ENCODE;
00422     else if (d == 8 || d == 32)
00423         *ptype = L_JPEG_ENCODE;
00424     else
00425         return ERROR_INT("type selection failure", procName, 1);
00426 
00427     return 0;
00428 }
00429 
00430 
00431 /*---------------------------------------------------------------------*
00432  *                Single page, multi-image converters                  *
00433  *---------------------------------------------------------------------*/
00434 /*!
00435  *  convertToPdf()
00436  *
00437  *      Input:  filein (input image file -- any format)
00438  *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
00439  *              quality (used for JPEG only; 0 for default (75))
00440  *              fileout (output pdf file; only required on last image on page)
00441  *              x, y (location of lower-left corner of image, in pixels,
00442  *                    relative to the PostScript origin (0,0) at
00443  *                    the lower-left corner of the page)
00444  *              res (override the resolution of the input image, in ppi;
00445  *                   use 0 to respect the resolution embedded in the input)
00446  *              &lpd (ptr to lpd, which is created on the first invocation
00447  *                    and returned until last image is processed, at which
00448  *                    time it is destroyed)
00449  *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
00450  *                       L_LAST_IMAGE)
00451  *              title (<optional> pdf title; if null, taken from the first
00452  *                     image placed on a page; e.g., an input image filename)
00453  *      Return: 0 if OK, 1 on error
00454  *
00455  *  Notes:
00456  *      (1) To wrap only one image in pdf, input @plpd = NULL, and
00457  *          the value of @position will be ignored:
00458  *            convertToPdf(...  type, quality, x, y, res, NULL, 0);
00459  *      (2) To wrap multiple images on a single pdf page, this is called
00460  *          once for each successive image.  Do it this way:
00461  *            L_PDF_DATA   *lpd;
00462  *            convertToPdf(...  type, quality, x, y, res, &lpd, L_FIRST_IMAGE);
00463  *            convertToPdf(...  type, quality, x, y, res, &lpd, L_NEXT_IMAGE);
00464  *            ...
00465  *            convertToPdf(...  type, quality, x, y, res, &lpd, L_LAST_IMAGE);
00466  *          This will write the result to the value of @fileout specified
00467  *          in the last call; values of @fileout in earlier calls are ignored.
00468  *          On the last call: the pdf data bytes are computed and written
00469  *          to @fileout, lpd is destroyed internally, and the returned
00470  *          value of lpd is null.  So the client has nothing to clean up.
00471  *      (3) (a) Set @res == 0 to respect the resolution embedded in the
00472  *              image file.  If no resolution is embedded, it will be set
00473  *              to the default value.
00474  *          (b) Set @res to some other value to override the file resolution.
00475  *      (4) (a) If the input @res and the resolution of the output device
00476  *              are equal, the image will be "displayed" at the same size
00477  *              as the original.
00478  *          (b) If the input @res is 72, the output device will render
00479  *              the image at 1 pt/pixel.
00480  *          (c) Some possible choices for the default input pix resolution are:
00481  *                 72 ppi     Render pix on any output device at one pt/pixel
00482  *                 96 ppi     Windows default for generated display images
00483  *                300 ppi     Typical default for scanned images.
00484  *              We choose 300, which is sensible for rendering page images.
00485  *              However,  images come from a variety of sources, and
00486  *              some are explicitly created for viewing on a display.
00487  */
00488 l_int32
00489 convertToPdf(const char   *filein,
00490              l_int32       type,
00491              l_int32       quality,
00492              const char   *fileout,
00493              l_int32       x,
00494              l_int32       y,
00495              l_int32       res,
00496              L_PDF_DATA  **plpd,
00497              l_int32       position,
00498              const char   *title)
00499 {
00500 l_uint8  *data;
00501 l_int32   ret;
00502 size_t    nbytes;
00503 
00504     PROCNAME("convertToPdf");
00505 
00506     if (!filein)
00507         return ERROR_INT("filein not defined", procName, 1);
00508     if (!plpd || (position == L_LAST_IMAGE)) {
00509         if (!fileout)
00510             return ERROR_INT("fileout not defined", procName, 1);
00511     }
00512     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
00513         type != L_FLATE_ENCODE)
00514         return ERROR_INT("invalid conversion type", procName, 1);
00515 
00516     if (convertToPdfData(filein, type, quality, &data, &nbytes, x, y,
00517                          res, plpd, position, title))
00518         return ERROR_INT("pdf data not made", procName, 1);
00519 
00520     if (!plpd || (position == L_LAST_IMAGE)) {
00521         ret = l_binaryWrite(fileout, "w", data, nbytes);
00522         FREE(data);
00523         if (ret)
00524             return ERROR_INT("pdf data not written to file", procName, 1);
00525     }
00526 
00527     return 0;
00528 }
00529 
00530 
00531 /*!
00532  *  convertImageDataToPdf()
00533  *
00534  *      Input:  imdata (array of formatted image data; e.g., png, jpeg)
00535  *              size (size of image data)
00536  *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
00537  *              quality (used for JPEG only; 0 for default (75))
00538  *              fileout (output pdf file; only required on last image on page)
00539  *              x, y (location of lower-left corner of image, in pixels,
00540  *                    relative to the PostScript origin (0,0) at
00541  *                    the lower-left corner of the page)
00542  *              res (override the resolution of the input image, in ppi;
00543  *                   use 0 to respect the resolution embedded in the input)
00544  *              &lpd (ptr to lpd, which is created on the first invocation
00545  *                    and returned until last image is processed, at which
00546  *                    time it is destroyed)
00547  *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
00548  *                       L_LAST_IMAGE)
00549  *              title (<optional> pdf title; taken from the first image
00550  *                     placed on a page; e.g., an input image filename)
00551  *      Return: 0 if OK, 1 on error
00552  *
00553  *  Notes:
00554  *      (1) If @res == 0 and the input resolution field is 0,
00555  *          this will use DEFAULT_INPUT_RES.
00556  *      (2) See comments in convertToPdf().
00557  */
00558 l_int32
00559 convertImageDataToPdf(l_uint8      *imdata,
00560                       size_t        size,
00561                       l_int32       type,
00562                       l_int32       quality,
00563                       const char   *fileout,
00564                       l_int32       x,
00565                       l_int32       y,
00566                       l_int32       res,
00567                       L_PDF_DATA  **plpd,
00568                       l_int32       position,
00569                       const char   *title)
00570 {
00571 l_int32  ret;
00572 PIX     *pix;
00573 
00574     PROCNAME("convertImageDataToPdf");
00575 
00576     if (!imdata)
00577         return ERROR_INT("image data not defined", procName, 1);
00578     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
00579         type != L_FLATE_ENCODE)
00580         return ERROR_INT("invalid conversion type", procName, 1);
00581     if (!plpd || (position == L_LAST_IMAGE)) {
00582         if (!fileout)
00583             return ERROR_INT("fileout not defined", procName, 1);
00584     }
00585 
00586     if ((pix = pixReadMem(imdata, size)) == NULL)
00587         return ERROR_INT("pix not read", procName, 1);
00588     ret = pixConvertToPdf(pix, type, quality, fileout, x, y, res,
00589                           plpd, position, title);
00590     pixDestroy(&pix);
00591     return ret;
00592 }
00593 
00594 
00595 /*!
00596  *  convertToPdfData()
00597  *
00598  *      Input:  filein (input image file -- any format)
00599  *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
00600  *              quality (used for JPEG only; 0 for default (75))
00601  *              &data (<return> pdf data in memory)
00602  *              &nbytes (<return> number of bytes in pdf data)
00603  *              x, y (location of lower-left corner of image, in pixels,
00604  *                    relative to the PostScript origin (0,0) at
00605  *                    the lower-left corner of the page)
00606  *              res (override the resolution of the input image, in ppi;
00607  *                   use 0 to respect the resolution embedded in the input)
00608  *              &lpd (ptr to lpd, which is created on the first invocation
00609  *                    and returned until last image is processed, at which
00610  *                    time it is destroyed)
00611  *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
00612  *                       L_LAST_IMAGE)
00613  *              title (<optional> pdf title; taken from the first image
00614  *                     placed on a page; e.g., an input image filename)
00615  *      Return: 0 if OK, 1 on error
00616  *
00617  *  Notes:
00618  *      (1) If @res == 0 and the input resolution field is 0,
00619  *          this will use DEFAULT_INPUT_RES.
00620  *      (2) See comments in convertToPdf().
00621  */
00622 l_int32
00623 convertToPdfData(const char   *filein,
00624                  l_int32       type,
00625                  l_int32       quality,
00626                  l_uint8     **pdata,
00627                  size_t       *pnbytes,
00628                  l_int32       x,
00629                  l_int32       y,
00630                  l_int32       res,
00631                  L_PDF_DATA  **plpd,
00632                  l_int32       position,
00633                  const char   *title)
00634 {
00635 PIX  *pix;
00636 
00637     PROCNAME("convertToPdfData");
00638 
00639     if (!pdata)
00640         return ERROR_INT("&data not defined", procName, 1);
00641     *pdata = NULL;
00642     if (!pnbytes)
00643         return ERROR_INT("&nbytes not defined", procName, 1);
00644     *pnbytes = 0;
00645     if (!filein)
00646         return ERROR_INT("filein not defined", procName, 1);
00647     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
00648         type != L_FLATE_ENCODE)
00649         return ERROR_INT("invalid conversion type", procName, 1);
00650 
00651     if ((pix = pixRead(filein)) == NULL)
00652         return ERROR_INT("pix not made", procName, 1);
00653 
00654     pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
00655                         x, y, res, plpd, position, title);
00656     pixDestroy(&pix);
00657     return 0;
00658 }
00659 
00660 
00661 /*!
00662  *  convertImageDataToPdfData()
00663  *
00664  *      Input:  imdata (array of formatted image data; e.g., png, jpeg)
00665  *              size (size of image data)
00666  *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
00667  *              quality (used for JPEG only; 0 for default (75))
00668  *              &data (<return> pdf data in memory)
00669  *              &nbytes (<return> number of bytes in pdf data)
00670  *              x, y (location of lower-left corner of image, in pixels,
00671  *                    relative to the PostScript origin (0,0) at
00672  *                    the lower-left corner of the page)
00673  *              res (override the resolution of the input image, in ppi;
00674  *                   use 0 to respect the resolution embedded in the input)
00675  *              &lpd (ptr to lpd, which is created on the first invocation
00676  *                    and returned until last image is processed, at which
00677  *                    time it is destroyed)
00678  *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
00679  *                       L_LAST_IMAGE)
00680  *              title (<optional> pdf title; taken from the first image
00681  *                     placed on a page; e.g., an input image filename)
00682  *      Return: 0 if OK, 1 on error
00683  *
00684  *  Notes:
00685  *      (1) If @res == 0 and the input resolution field is 0,
00686  *          this will use DEFAULT_INPUT_RES.
00687  *      (2) See comments in convertToPdf().
00688  */
00689 l_int32
00690 convertImageDataToPdfData(l_uint8      *imdata,
00691                           size_t        size,
00692                           l_int32       type,
00693                           l_int32       quality,
00694                           l_uint8     **pdata,
00695                           size_t       *pnbytes,
00696                           l_int32       x,
00697                           l_int32       y,
00698                           l_int32       res,
00699                           L_PDF_DATA  **plpd,
00700                           l_int32       position,
00701                           const char   *title)
00702 {
00703 l_int32  ret;
00704 PIX     *pix;
00705 
00706     PROCNAME("convertImageDataToPdfData");
00707 
00708     if (!imdata)
00709         return ERROR_INT("image data not defined", procName, 1);
00710     if (!pdata)
00711         return ERROR_INT("&data not defined", procName, 1);
00712     *pdata = NULL;
00713     if (!pnbytes)
00714         return ERROR_INT("&nbytes not defined", procName, 1);
00715     *pnbytes = 0;
00716     if (plpd) {  /* part of multi-page invocation */
00717         if (position == L_FIRST_IMAGE)
00718             *plpd = NULL;
00719     }
00720 
00721     if ((pix = pixReadMem(imdata, size)) == NULL)
00722         return ERROR_INT("pix not read", procName, 1);
00723     ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
00724                               x, y, res, plpd, position, title);
00725     pixDestroy(&pix);
00726     return ret;
00727 }
00728 
00729 
00730 /*!
00731  *  pixConvertToPdf()
00732  *
00733  *      Input:  pix
00734  *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
00735  *              quality (used for JPEG only; 0 for default (75))
00736  *              fileout (output pdf file; only required on last image on page)
00737  *              x, y (location of lower-left corner of image, in pixels,
00738  *                    relative to the PostScript origin (0,0) at
00739  *                    the lower-left corner of the page)
00740  *              res (override the resolution of the input image, in ppi;
00741  *                   use 0 to respect the resolution embedded in the input)
00742  *              &lpd (ptr to lpd, which is created on the first invocation
00743  *                    and returned until last image is processed)
00744  *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
00745  *                       L_LAST_IMAGE)
00746  *              title (<optional> pdf title; taken from the first image
00747  *                     placed on a page; e.g., an input image filename)
00748  *      Return: 0 if OK, 1 on error
00749  *
00750  *  Notes:
00751  *      (1) If @res == 0 and the input resolution field is 0,
00752  *          this will use DEFAULT_INPUT_RES.
00753  *      (2) This only writes data to fileout if it is the last
00754  *          image to be written on the page.
00755  *      (3) See comments in convertToPdf().
00756  */
00757 l_int32
00758 pixConvertToPdf(PIX          *pix,
00759                 l_int32       type,
00760                 l_int32       quality,
00761                 const char   *fileout,
00762                 l_int32       x,
00763                 l_int32       y,
00764                 l_int32       res,
00765                 L_PDF_DATA  **plpd,
00766                 l_int32       position,
00767                 const char   *title)
00768 {
00769 l_uint8  *data;
00770 l_int32   ret;
00771 size_t    nbytes;
00772 
00773     PROCNAME("pixConvertToPdf");
00774 
00775     if (!pix)
00776         return ERROR_INT("pix not defined", procName, 1);
00777     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
00778         type != L_FLATE_ENCODE)
00779         return ERROR_INT("invalid conversion type", procName, 1);
00780     if (!plpd || (position == L_LAST_IMAGE)) {
00781         if (!fileout)
00782             return ERROR_INT("fileout not defined", procName, 1);
00783     }
00784 
00785     if (pixConvertToPdfData(pix, type, quality, &data, &nbytes,
00786                             x, y, res, plpd, position, title))
00787         return ERROR_INT("pdf data not made", procName, 1);
00788 
00789     if (!plpd || (position == L_LAST_IMAGE)) {
00790         ret = l_binaryWrite(fileout, "w", data, nbytes);
00791         FREE(data);
00792         if (ret)
00793             return ERROR_INT("pdf data not written to file", procName, 1);
00794     }
00795     return 0;
00796 }
00797 
00798 
00799 /*!
00800  *  pixConvertToPdfData()
00801  *
00802  *      Input:  pix (all depths; cmap OK)
00803  *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
00804  *              quality (used for JPEG only; 0 for default (75))
00805  *              &data (<return> pdf array)
00806  *              &nbytes (<return> number of bytes in pdf array)
00807  *              x, y (location of lower-left corner of image, in pixels,
00808  *                    relative to the PostScript origin (0,0) at
00809  *                    the lower-left corner of the page)
00810  *              res (override the resolution of the input image, in ppi;
00811  *                   use 0 to respect the resolution embedded in the input)
00812  *              &lpd (ptr to lpd, which is created on the first invocation
00813  *                    and returned until last image is processed)
00814  *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
00815  *                       L_LAST_IMAGE)
00816  *              title (<optional> pdf title; taken from the first image
00817  *                     placed on a page; e.g., an input image filename)
00818  *      Return: 0 if OK, 1 on error
00819  *
00820  *  Notes:
00821  *      (1) If @res == 0 and the input resolution field is 0,
00822  *          this will use DEFAULT_INPUT_RES.
00823  *      (2) This only writes @data if it is the last image to be
00824  *          written on the page.
00825  *      (3) See comments in convertToPdf().
00826  */
00827 l_int32
00828 pixConvertToPdfData(PIX          *pix,
00829                     l_int32       type,
00830                     l_int32       quality,
00831                     l_uint8     **pdata,
00832                     size_t       *pnbytes,
00833                     l_int32       x,
00834                     l_int32       y,
00835                     l_int32       res,
00836                     L_PDF_DATA  **plpd,
00837                     l_int32       position,
00838                     const char   *title)
00839 {
00840 l_int32             pixres, w, h, d, ret;
00841 l_float32           xpt, ypt, wpt, hpt;
00842 L_COMPRESSED_DATA  *cid = NULL;
00843 L_PDF_DATA         *lpd = NULL;
00844 PIXCMAP            *cmap;
00845 
00846     PROCNAME("pixConvertToPdfData");
00847 
00848     if (!pdata)
00849         return ERROR_INT("&data not defined", procName, 1);
00850     *pdata = NULL;
00851     if (!pnbytes)
00852         return ERROR_INT("&nbytes not defined", procName, 1);
00853     *pnbytes = 0;
00854     if (!pix)
00855         return ERROR_INT("pix not defined", procName, 1);
00856     if (plpd) {  /* part of multi-page invocation */
00857         if (position == L_FIRST_IMAGE)
00858             *plpd = NULL;
00859     }
00860 
00861         /* Sanity check on requested encoding */
00862     d = pixGetDepth(pix);
00863     cmap = pixGetColormap(pix);
00864     if (cmap && type != L_FLATE_ENCODE) {
00865         L_WARNING("pix has cmap; using flate encoding", procName);
00866         type = L_FLATE_ENCODE;
00867     }
00868     else if (d < 8 && type == L_JPEG_ENCODE) {
00869         L_WARNING("pix has < 8 bpp; using flate encoding", procName);
00870         type = L_FLATE_ENCODE;
00871     }
00872     else if (d > 1 && type == L_G4_ENCODE) {
00873         L_WARNING("pix has > 1 bpp; using flate encoding", procName);
00874         type = L_FLATE_ENCODE;
00875     }
00876 
00877     if (type == L_JPEG_ENCODE) {
00878         if ((cid = pixGenerateJpegData(pix, 0, quality)) == NULL)
00879             return ERROR_INT("jpeg data not made", procName, 1);
00880         pixres = cid->res;
00881         w = cid->w;
00882         h = cid->h;
00883     }
00884     else if (type == L_G4_ENCODE) {
00885         if ((cid = pixGenerateG4Data(pix, 0)) == NULL)
00886             return ERROR_INT("g4 data not made", procName, 1);
00887         pixres = cid->res;
00888         w = cid->w;
00889         h = cid->h;
00890     }
00891     else if (type == L_FLATE_ENCODE) {
00892         if ((cid = pixGenerateFlateData(pix, 0)) == NULL)
00893             return ERROR_INT("flate data not made", procName, 1);
00894         pixres = cid->res;
00895         w = cid->w;
00896         h = cid->h;
00897     }
00898     else
00899         return ERROR_INT("invalid conversion type", procName, 1);
00900 
00901         /* Get media box in pts.  Guess the input image resolution
00902          * based on the input parameter @res, the resolution data in
00903          * the pix, and the size of the image. */
00904     if (res <= 0.0) {
00905         if (pixres > 0)
00906             res = pixres;
00907         else
00908             res = DEFAULT_INPUT_RES;
00909     }
00910     xpt = x * 72. / res;
00911     ypt = y * 72. / res;
00912     wpt = w * 72. / res;
00913     hpt = h * 72. / res;
00914 
00915         /* Set up lpd */
00916     if (!plpd) {  /* single image */
00917         if ((lpd = pdfdataCreate(title)) == NULL)
00918             return ERROR_INT("lpd not made", procName, 1);
00919     }
00920     else if (position == L_FIRST_IMAGE) {  /* first of multiple images */
00921         if ((lpd = pdfdataCreate(title)) == NULL)
00922             return ERROR_INT("lpd not made", procName, 1);
00923         *plpd = lpd;
00924     }
00925     else  /* not the first of multiple images */
00926         lpd = *plpd;
00927 
00928         /* Add the data to the lpd */
00929     ptraAdd(lpd->cida, cid);
00930     lpd->n++;
00931     ptaAddPt(lpd->xy, xpt, ypt);
00932     ptaAddPt(lpd->wh, wpt, hpt);
00933 
00934         /* If a single image or the last of multiple images,
00935          * generate the pdf and destroy the lpd */
00936     if (!plpd || (position == L_LAST_IMAGE)) {
00937         ret = l_generatePdf(pdata, pnbytes, lpd);
00938         pdfdataDestroy(&lpd);
00939         if (plpd) *plpd = NULL;
00940         if (ret)
00941             return ERROR_INT("pdf output not made", procName, 1);
00942     }
00943 
00944     return 0;
00945 }
00946 
00947 
00948 /*!
00949  *  pixWriteStreamPdf()
00950  *
00951  *      Input:  fp (stream opened for writing)
00952  *              pix (all depths, cmap OK)
00953  *              res (override the resolution of the input image, in ppi;
00954  *                   use 0 to respect the resolution embedded in the input)
00955  *              title (<optional> pdf title; taken from the first image
00956  *                     placed on a page; e.g., an input image filename)
00957  *      Return: 0 if OK, 1 on error
00958  *
00959  *  Notes:
00960  *      (1) This is the simplest interface for writing a single image
00961  *          with pdf encoding.  It uses G4 encoding for 1 bpp,
00962  *          JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
00963  *          encoding for everything else.
00964  */
00965 l_int32
00966 pixWriteStreamPdf(FILE        *fp,
00967                   PIX         *pix,
00968                   l_int32      res,
00969                   const char  *title)
00970 {
00971 l_uint8  *data;
00972 l_int32   ret, d, type;
00973 size_t    nbytes;
00974 PIXCMAP  *cmap;
00975 
00976     PROCNAME("pixWriteStreamPdf");
00977 
00978     if (!fp)
00979         return ERROR_INT("stream not opened", procName, 1);
00980     if (!pix)
00981         return ERROR_INT("pix not defined", procName, 1);
00982 
00983     d = pixGetDepth(pix);
00984     cmap = pixGetColormap(pix);
00985     if (d == 1)
00986         type = L_G4_ENCODE;
00987     else if (cmap || d == 2 || d == 4 || d == 16)
00988         type = L_FLATE_ENCODE;
00989     else  /* d == 8 (no cmap) or d == 32 */
00990         type = L_JPEG_ENCODE;
00991     if (pixConvertToPdfData(pix, type, 75, &data, &nbytes,
00992                             0, 0, res, NULL, 0, title))
00993         return ERROR_INT("pdf data not made", procName, 1);
00994     ret = fwrite(data, 1, nbytes, fp);
00995 
00996     FREE(data);
00997     if (ret)
00998         return ERROR_INT("pdf data not written to stream", procName, 1);
00999     return 0;
01000 }
01001 
01002 
01003 /*---------------------------------------------------------------------*
01004  *            Segmented multi-page, multi-image converter              *
01005  *---------------------------------------------------------------------*/
01006 /*!
01007  *  convertSegmentedFilesToPdf()
01008  *
01009  *      Input:  directory name (containing images)
01010  *              substr (<optional> substring filter on filenames; can be NULL)
01011  *              res (input resolution of all images)
01012  *              type (compression type for non-image regions; the
01013  *                    image regions are always compressed with L_JPEG_ENCODE)
01014  *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
01015  *              boxaa (of image regions)
01016  *              quality (used for JPEG only; 0 for default (75))
01017  *              scalefactor (scaling factor applied to each image region)
01018  *              title (<optional> pdf title; if null, taken from the first
01019  *                     image filename)
01020  *              fileout (pdf file of all images)
01021  *      Return: 0 if OK, 1 on error
01022  *
01023  *  Notes:
01024  *      (1) If @substr is not NULL, only image filenames that contain
01025  *          the substring can be used.  If @substr == NULL, all files
01026  *          in the directory are used.
01027  *      (2) The files in the directory, after optional filtering by
01028  *          the substring, are lexically sorted in increasing order
01029  *          before concatenation.
01030  *      (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
01031  *          colormap and many colors, or 32 bpp; FLATE for anything else.
01032  *      (4) The boxaa contains one boxa of "image regions" for each
01033  *          image file.  The boxa must all exist, but they can be empty.
01034  *          They must be aligned with the sorted set of images.
01035  *      (5) The scalefactor is applied to each image region.  It is
01036  *          typically < 1.0, to save bytes in the final pdf, because
01037  *          the resolution is often not critical in non-text regions.
01038  *      (6) The non-image regions are automatically scaled up by 2x and
01039  *          thresholded if the encoding type is G4.  If the non-image
01040  *          regions are not encoded with G4, no scaling is performed on them.
01041  */
01042 l_int32
01043 convertSegmentedFilesToPdf(const char  *dirname,
01044                            const char  *substr,
01045                            l_int32      res,
01046                            l_int32      type,
01047                            l_int32      thresh,
01048                            BOXAA       *baa,
01049                            l_int32      quality,
01050                            l_float32    scalefactor,
01051                            const char  *title,
01052                            const char  *fileout)
01053 {
01054 char     *fname;
01055 l_uint8  *imdata, *data;
01056 l_int32   i, npages, nboxa, nboxes, ret;
01057 size_t    imbytes, databytes;
01058 BOXA     *boxa;
01059 L_BYTEA  *ba;
01060 L_PTRA   *pa_data;
01061 SARRAY   *sa;
01062 
01063     PROCNAME("convertSegmentedFilesToPdf");
01064 
01065     if (!dirname)
01066         return ERROR_INT("dirname not defined", procName, 1);
01067     if (!baa)
01068         return ERROR_INT("baa not defined", procName, 1);
01069     if (!fileout)
01070         return ERROR_INT("fileout not defined", procName, 1);
01071 
01072     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
01073         return ERROR_INT("sa not made", procName, 1);
01074 
01075         /* Generate and save all the encoded pdf strings */
01076     npages = sarrayGetCount(sa);
01077     nboxa = boxaaGetCount(baa);
01078     if (npages != nboxa) {
01079         sarrayDestroy(&sa);
01080         return ERROR_INT("npages != nboxa", procName, 1);
01081     }
01082     pa_data = ptraCreate(npages);
01083     for (i = 0; i < npages; i++) {
01084         fname = sarrayGetString(sa, i, L_NOCOPY);
01085         boxa = boxaaGetBoxa(baa, i, L_CLONE);
01086         nboxes = boxaGetCount(boxa);
01087         if (nboxes == 0)
01088             boxaDestroy(&boxa);
01089         ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa,
01090                                         quality, scalefactor,
01091                                         &imdata, &imbytes);
01092         boxaDestroy(&boxa);  /* safe; in case nboxes > 0 */
01093         if (ret) {
01094             L_ERROR_STRING("pdf encoding failed for %s", procName, fname);
01095             continue;
01096         }
01097         ba = l_byteaInitFromMem(imdata, imbytes);
01098         if (imdata) FREE(imdata);
01099         ptraAdd(pa_data, ba);
01100     }
01101     sarrayDestroy(&sa);
01102 
01103     ptraGetActualCount(pa_data, &npages);
01104     if (npages == 0) {
01105         L_ERROR("no pdf files made", procName);
01106         ptraDestroy(&pa_data, FALSE, FALSE);
01107         return 1;
01108     }
01109 
01110         /* Concatenate */
01111     ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);
01112 
01113         /* Clean up */
01114     ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
01115     for (i = 0; i < npages; i++) {
01116         ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
01117         l_byteaDestroy(&ba);
01118     }
01119     ptraDestroy(&pa_data, FALSE, FALSE);
01120 
01121     if (ret) {
01122         if (data) FREE(data);
01123         return ERROR_INT("pdf data not made", procName, 1);
01124     }
01125 
01126     ret = l_binaryWrite(fileout, "w", data, databytes);
01127     FREE(data);
01128     if (ret)
01129         L_ERROR("pdf data not written to file", procName);
01130     return ret;
01131 }
01132 
01133 
01134 /*---------------------------------------------------------------------*
01135  *            Segmented single page, multi-image converters            *
01136  *---------------------------------------------------------------------*/
01137 /*!
01138  *  convertToPdfSegmented()
01139  *
01140  *      Input:  filein (input image file -- any format)
01141  *              res (input image resolution; typ. 300 ppi; use 0 for default)
01142  *              type (compression type for non-image regions; the
01143  *                    image regions are always compressed with L_JPEG_ENCODE)
01144  *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
01145  *              boxa (of image regions; can be null)
01146  *              quality (used for jpeg image regions; 0 for default)
01147  *              scalefactor (used for jpeg regions; must be <= 1.0)
01148  *              fileout (output pdf file)
01149  *      Return: 0 if OK, 1 on error
01150  *
01151  *  Notes:
01152  *      (1) If there are no image regions, set @boxa == NULL;
01153  *          @quality and @scalefactor are ignored.
01154  *      (2) Typically, @scalefactor is < 1.0, because the image regions
01155  *          can be rendered at a lower resolution (for better compression)
01156  *          than the text regions.  If @scalefactor == 0, we use 1.0.
01157  *          If the input image is 1 bpp and scalefactor < 1.0, we
01158  *          use scaleToGray() to downsample the image regions to gray
01159  *          before compressing them.
01160  *      (3) If the compression type for non-image regions is L_G4_ENCODE
01161  *          and bpp > 1, the image is upscaled 2x and thresholded
01162  *          to 1 bpp.  That is the only situation where @thresh is used.
01163  *      (4) The parameter @quality is only used for image regions.
01164  *          If @type == L_JPEG_ENCODE, default jpeg quality (75) is
01165  *          used for the non-image regions.
01166  *      (5) Processing matrix for non-image regions.
01167  *
01168  *          Input           G4              JPEG                FLATE
01169  *          ----------|---------------------------------------------------
01170  *          1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
01171  *                    |
01172  *          cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
01173  *                    |
01174  *          2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
01175  *          no cmap   |                  2,4 bpp
01176  *                    |
01177  *          8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
01178  *          no cmap   |                  8,32 bpp
01179  *
01180  *          Summary:
01181  *          (a) if G4 is requested, G4 is used, with 2x upscaling
01182  *              for all cases except 1 bpp.
01183  *          (b) if JPEG is requested, use flate encoding for all cases
01184  *              except 8 bpp without cmap and 32 bpp (rgb).
01185  *          (c) if FLATE is requested, use flate with no transformation
01186  *              of the raster data.
01187  *      (6) Calling options/sequence for these functions:
01188  *              file  -->  file      (convertToPdfSegmented)
01189  *                  pix  -->  file      (pixConvertToPdfSegmented)
01190  *                      pix  -->  data      (pixConvertToPdfDataSegmented)
01191  *              file  -->  data      (convertToPdfDataSegmented)
01192  *                      pix  -->  data      (pixConvertToPdfDataSegmented)
01193  */
01194 l_int32
01195 convertToPdfSegmented(const char  *filein,
01196                       l_int32      res,
01197                       l_int32      type,
01198                       l_int32      thresh,
01199                       BOXA        *boxa,
01200                       l_int32      quality,
01201                       l_float32    scalefactor,
01202                       const char  *fileout)
01203 {
01204 l_int32  ret;
01205 PIX     *pixs;
01206 
01207     PROCNAME("convertToPdfSegmented");
01208 
01209     if (!filein)
01210         return ERROR_INT("filein not defined", procName, 1);
01211     if (!fileout)
01212         return ERROR_INT("fileout not defined", procName, 1);
01213     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
01214         type != L_FLATE_ENCODE)
01215         return ERROR_INT("invalid conversion type", procName, 1);
01216     if (boxa && scalefactor > 1.0) {
01217         L_WARNING("setting scalefactor to 1.0", procName);
01218         scalefactor = 1.0;
01219     }
01220 
01221     if ((pixs = pixRead(filein)) == NULL)
01222         return ERROR_INT("pixs not made", procName, 1);
01223 
01224     ret = pixConvertToPdfSegmented(pixs, res, type, thresh, boxa, quality,
01225                                    scalefactor, fileout, filein);
01226     pixDestroy(&pixs);
01227     return ret;
01228 }
01229 
01230 
01231 /*!
01232  *  pixConvertToPdfSegmented()
01233  *
01234  *      Input:  pixs (any depth, cmap OK)
01235  *              res (input image resolution; typ. 300 ppi; use 0 for default)
01236  *              type (compression type for non-image regions; the
01237  *                    image regions are always compressed with L_JPEG_ENCODE)
01238  *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
01239  *              boxa (of image regions; can be null)
01240  *              quality (used for jpeg image regions; 0 for default)
01241  *              scalefactor (used for jpeg regions; must be <= 1.0)
01242  *              fileout (output pdf file)
01243  *              title (<optional> pdf title; typically taken from the
01244  *                     input file for the pix)
01245  *      Return: 0 if OK, 1 on error
01246  *
01247  *  Notes:
01248  *      (1) See convertToPdfSegmented() for details.
01249  */
01250 l_int32
01251 pixConvertToPdfSegmented(PIX         *pixs,
01252                          l_int32      res,
01253                          l_int32      type,
01254                          l_int32      thresh,
01255                          BOXA        *boxa,
01256                          l_int32      quality,
01257                          l_float32    scalefactor,
01258                          const char  *fileout,
01259                          const char  *title)
01260 {
01261 l_uint8  *data;
01262 l_int32   ret;
01263 size_t    nbytes;
01264 
01265     PROCNAME("pixConvertToPdfSegmented");
01266 
01267     if (!pixs)
01268         return ERROR_INT("pixs not defined", procName, 1);
01269     if (!fileout)
01270         return ERROR_INT("fileout not defined", procName, 1);
01271     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
01272         type != L_FLATE_ENCODE)
01273         return ERROR_INT("invalid conversion type", procName, 1);
01274     if (boxa && scalefactor > 1.0) {
01275         L_WARNING("setting scalefactor to 1.0", procName);
01276         scalefactor = 1.0;
01277     }
01278 
01279     ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality,
01280                                        scalefactor, &data, &nbytes, title);
01281     if (ret)
01282         return ERROR_INT("pdf generation failure", procName, 1);
01283 
01284     ret = l_binaryWrite(fileout, "w", data, nbytes);
01285     if (data) FREE(data);
01286     return ret;
01287 }
01288 
01289 
01290 /*!
01291  *  convertToPdfDataSegmented()
01292  *
01293  *      Input:  filein (input image file -- any format)
01294  *              res (input image resolution; typ. 300 ppi; use 0 for default)
01295  *              type (compression type for non-image regions; the
01296  *                    image regions are always compressed with L_JPEG_ENCODE)
01297  *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
01298  *              boxa (of image regions; can be null)
01299  *              quality (used for jpeg image regions; 0 for default)
01300  *              scalefactor (used for jpeg regions; must be <= 1.0)
01301  *              &data (<return> pdf data in memory)
01302  *              &nbytes (<return> number of bytes in pdf data)
01303  *      Return: 0 if OK, 1 on error
01304  *
01305  *  Notes:
01306  *      (1) If there are no image regions, set @boxa == NULL;
01307  *          @quality and @scalefactor are ignored.
01308  *      (2) Typically, @scalefactor is < 1.0.  The image regions are
01309  */
01310 l_int32
01311 convertToPdfDataSegmented(const char  *filein,
01312                           l_int32      res,
01313                           l_int32      type,
01314                           l_int32      thresh,
01315                           BOXA        *boxa,
01316                           l_int32      quality,
01317                           l_float32    scalefactor,
01318                           l_uint8    **pdata,
01319                           size_t      *pnbytes)
01320 {
01321 l_int32  ret;
01322 PIX     *pixs;
01323 
01324     PROCNAME("convertToPdfDataSegmented");
01325 
01326     if (!pdata)
01327         return ERROR_INT("&data not defined", procName, 1);
01328     *pdata = NULL;
01329     if (!pnbytes)
01330         return ERROR_INT("&nbytes not defined", procName, 1);
01331     *pnbytes = 0;
01332     if (!filein)
01333         return ERROR_INT("filein not defined", procName, 1);
01334     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
01335         type != L_FLATE_ENCODE)
01336         return ERROR_INT("invalid conversion type", procName, 1);
01337     if (boxa && scalefactor > 1.0) {
01338         L_WARNING("setting scalefactor to 1.0", procName);
01339         scalefactor = 1.0;
01340     }
01341 
01342     if ((pixs = pixRead(filein)) == NULL)
01343         return ERROR_INT("pixs not made", procName, 1);
01344 
01345     ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa,
01346                                        quality, scalefactor,
01347                                        pdata, pnbytes, filein);
01348     pixDestroy(&pixs);
01349     return ret;
01350 }
01351 
01352 
01353 /*!
01354  *  pixConvertToPdfDataSegmented()
01355  *
01356  *      Input:  pixs (any depth, cmap OK)
01357  *              res (input image resolution; typ. 300 ppi; use 0 for default)
01358  *              type (compression type for non-image regions; the
01359  *                    image regions are always compressed with L_JPEG_ENCODE)
01360  *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
01361  *              boxa (of image regions; can be null)
01362  *              quality (used for jpeg image regions; 0 for default)
01363  *              scalefactor (used for jpeg regions; must be <= 1.0)
01364  *              &data (<return> pdf data in memory)
01365  *              &nbytes (<return> number of bytes in pdf data)
01366  *              title (<optional> pdf title; typically taken from the
01367  *                     input file for the pix)
01368  *      Return: 0 if OK, 1 on error
01369  *
01370  *  Notes:
01371  *      (1) See convertToPdfSegmented() for details.
01372  */
01373 l_int32
01374 pixConvertToPdfDataSegmented(PIX         *pixs,
01375                              l_int32      res,
01376                              l_int32      type,
01377                              l_int32      thresh,
01378                              BOXA        *boxa,
01379                              l_int32      quality,
01380                              l_float32    scalefactor,
01381                              l_uint8    **pdata,
01382                              size_t      *pnbytes,
01383                              const char  *title)
01384 {
01385 l_int32      i, nbox, seq, bx, by, bw, bh, upscale;
01386 l_float32    scale;
01387 BOX         *box, *boxc, *box2;
01388 PIX         *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
01389 PIXCMAP     *cmap;
01390 L_PDF_DATA  *lpd;
01391 
01392     PROCNAME("pixConvertToPdfDataSegmented");
01393 
01394     if (!pdata)
01395         return ERROR_INT("&data not defined", procName, 1);
01396     *pdata = NULL;
01397     if (!pnbytes)
01398         return ERROR_INT("&nbytes not defined", procName, 1);
01399     *pnbytes = 0;
01400     if (!pixs)
01401         return ERROR_INT("pixs not defined", procName, 1);
01402     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
01403         type != L_FLATE_ENCODE)
01404         return ERROR_INT("invalid conversion type", procName, 1);
01405     if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
01406         L_WARNING("setting scalefactor to 1.0", procName);
01407         scalefactor = 1.0;
01408     }
01409 
01410         /* Adjust scalefactor so that the product with res gives an integer */
01411     if (res <= 0)
01412         res = DEFAULT_INPUT_RES;
01413     scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res;
01414     cmap = pixGetColormap(pixs);
01415 
01416         /* Simple case: single image to be encoded */
01417     if (!boxa || boxaGetCount(boxa) == 0) {
01418         if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
01419             if (cmap)
01420                 pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
01421             else
01422                 pixt1 = pixConvertTo8(pixs, FALSE);
01423             pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
01424             pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
01425                                 0, 0, 2 * res, NULL, 0, title);
01426             pixDestroy(&pixt1);
01427             pixDestroy(&pixt2);
01428         }
01429         else {
01430             pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
01431                                 0, 0, res, NULL, 0, title);
01432         }
01433         return 0;
01434     }
01435 
01436         /* Multiple images to be encoded.  If @type == L_G4_ENCODE,
01437          * jpeg encode a version of pixs that is blanked in the non-image
01438          * regions, and paint the scaled non-image part onto it through a mask.
01439          * Otherwise, we must put the non-image part down first and
01440          * then render all the image regions separately on top of it,
01441          * at their own resolution. */
01442     pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE);  /* non-image */
01443     nbox = boxaGetCount(boxa);
01444     if (type == L_G4_ENCODE) {
01445         pixt2 = pixCreateTemplate(pixs);  /* only image regions */
01446         pixSetBlackOrWhite(pixt2, L_SET_WHITE);
01447         for (i = 0; i < nbox; i++) {
01448              box = boxaGetBox(boxa, i, L_CLONE);
01449              pix = pixClipRectangle(pixs, box, &boxc);
01450              boxGetGeometry(boxc, &bx, &by, &bw, &bh);
01451              pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
01452              pixDestroy(&pix);
01453              boxDestroy(&box);
01454              boxDestroy(&boxc);
01455         }
01456         pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
01457         if (pixGetDepth(pixt3) == 1)
01458             pixt4 = pixScaleToGray(pixt3, scale);
01459         else
01460             pixt4 = pixScale(pixt3, scale, scale);
01461         pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
01462                             0, 0, (l_int32)(scale * res),
01463                             &lpd, L_FIRST_IMAGE, title);
01464 
01465         if (pixGetDepth(pixt1) == 1) {
01466             pixt5 = pixClone(pixt1);
01467             upscale = 1;
01468         }
01469         else {
01470             pixt6 = pixConvertTo8(pixt1, 0);
01471             pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
01472             pixDestroy(&pixt6);
01473             upscale = 2;
01474         }
01475         pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata, pnbytes,
01476                             0, 0, upscale * res, &lpd, L_LAST_IMAGE, title);
01477         pixDestroy(&pixt2);
01478         pixDestroy(&pixt3);
01479         pixDestroy(&pixt4);
01480         pixDestroy(&pixt5);
01481     }
01482     else {
01483             /* Put the non-image part down first.  This is the full
01484                size of the page, so we can use it to find the page
01485                height in pixels, which is required for determining
01486                the LL corner of the image relative to the LL corner
01487                of the page. */
01488         pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0,
01489                             res, &lpd, L_FIRST_IMAGE, title);
01490         for (i = 0; i < nbox; i++) {
01491             box = boxaGetBox(boxa, i, L_CLONE);
01492             pixt2 = pixClipRectangle(pixs, box, &boxc);
01493             pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
01494             if (pixGetDepth(pixt3) == 1)
01495                 pixt4 = pixScaleToGray(pixt3, scale);
01496             else
01497                 pixt4 = pixScale(pixt3, scale, scale);
01498             box2 = boxTransform(boxc, 0, 0, scale, scale);
01499             boxGetGeometry(box2, &bx, &by, NULL, &bh);
01500             seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
01501             pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
01502                                 bx, by, (l_int32)(scale * res),
01503                                 &lpd, seq, title);
01504             pixDestroy(&pixt2);
01505             pixDestroy(&pixt3);
01506             pixDestroy(&pixt4);
01507             boxDestroy(&box);
01508             boxDestroy(&boxc);
01509             boxDestroy(&box2);
01510         }
01511     }
01512 
01513     pixDestroy(&pixt1);
01514     return 0;
01515 }
01516 
01517 
01518 /*---------------------------------------------------------------------*
01519  *         Helper functions for generating the output pdf string       *
01520  *---------------------------------------------------------------------*/
01521 /*!
01522  *  l_generatePdf()
01523  *
01524  *      Input:  &data (<return> pdf array)
01525  *              &nbytes (<return> number of bytes in pdf array)
01526  *              lpd (all the required input image data)
01527  *      Return: 0 if OK, 1 on error
01528  *
01529  *  Notes:
01530  *      (1) On error, no data is returned.
01531  *      (2) The objects are:
01532  *            1: Catalog
01533  *            2: Info
01534  *            3: Pages
01535  *            4: Page
01536  *            5: Contents  (rendering command)
01537  *            6 to 6+n-1: n XObjects
01538  *            6+n to 6+n+m-1: m colormaps
01539  */
01540 static l_int32
01541 l_generatePdf(l_uint8    **pdata,
01542               size_t      *pnbytes,
01543               L_PDF_DATA  *lpd)
01544 {
01545     PROCNAME("l_generatePdf");
01546 
01547     if (!pdata)
01548         return ERROR_INT("&data not defined", procName, 1);
01549     *pdata = NULL;
01550     if (!pnbytes)
01551         return ERROR_INT("&nbytes not defined", procName, 1);
01552     *pnbytes = 0;
01553     if (!lpd)
01554         return ERROR_INT("lpd not defined", procName, 1);
01555 
01556     generateFixedStringsPdf(lpd);
01557     generateMediaboxPdf(lpd);
01558     generatePageStringPdf(lpd);
01559     generateContentStringPdf(lpd);
01560     generatePreXStringsPdf(lpd);
01561     generateColormapStringsPdf(lpd);
01562     generateTrailerPdf(lpd);
01563     return generateOutputDataPdf(pdata, pnbytes, lpd);
01564 }
01565 
01566 
01567 static void
01568 generateFixedStringsPdf(L_PDF_DATA  *lpd)
01569 {
01570 char     buf[L_SMALLBUF];
01571 char    *version, *datestr;
01572 SARRAY  *sa;
01573 
01574         /* Accumulate data for the header and objects 1-3 */
01575     lpd->id = stringNew("%PDF-1.2\n");
01576     numaAddNumber(lpd->objsize, strlen(lpd->id));
01577 
01578     lpd->obj1 = stringNew("1 0 obj\n"
01579                           "<<\n"
01580                           "/Type /Catalog\n"
01581                           "/Pages 3 0 R\n"
01582                           ">>\n"
01583                           "endobj\n");
01584     numaAddNumber(lpd->objsize, strlen(lpd->obj1));
01585 
01586     sa = sarrayCreate(0);
01587     sarrayAddString(sa, (char *)"2 0 obj\n"
01588                                  "<<\n", L_COPY);
01589     if (lpd->title) {
01590         snprintf(buf, sizeof(buf), "/Title (%s)\n", lpd->title);
01591         sarrayAddString(sa, (char *)buf, L_COPY);
01592     }
01593     if (var_WRITE_DATE_AND_VERSION) {
01594         version = getLeptonicaVersion();
01595         snprintf(buf, sizeof(buf),
01596                  "/Producer (leptonica: %s)\n", version);
01597         FREE(version);
01598     }
01599     else
01600         snprintf(buf, sizeof(buf), "/Producer (leptonica)\n");
01601     sarrayAddString(sa, (char *)buf, L_COPY);
01602     if (var_WRITE_DATE_AND_VERSION) {
01603         datestr = l_getFormattedDate();
01604         snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr);
01605         sarrayAddString(sa, (char *)buf, L_COPY);
01606         FREE(datestr);
01607     }
01608     sarrayAddString(sa, (char *)">>\n"
01609                                 "endobj\n", L_COPY);
01610     lpd->obj2 = sarrayToString(sa, 0);
01611     numaAddNumber(lpd->objsize, strlen(lpd->obj2));
01612     sarrayDestroy(&sa);
01613 
01614     lpd->obj3 = stringNew("3 0 obj\n"
01615                           "<<\n"
01616                           "/Type /Pages\n"
01617                           "/Kids [ 4 0 R ]\n"
01618                           "/Count 1\n"
01619                           ">>\n");
01620     numaAddNumber(lpd->objsize, strlen(lpd->obj3));
01621 
01622         /* Do the post-datastream string */
01623     lpd->poststream = stringNew("\n"
01624                                 "endstream\n"
01625                                 "endobj\n");
01626     return;
01627 }
01628 
01629 
01630 static void
01631 generateMediaboxPdf(L_PDF_DATA  *lpd)
01632 {
01633 l_int32    i;
01634 l_float32  xpt, ypt, wpt, hpt, maxx, maxy;
01635 
01636         /* First get the full extent of all the images.
01637          * This is the mediabox, in pts. */
01638     maxx = maxy = 0;
01639     for (i = 0; i < lpd->n; i++) {
01640         ptaGetPt(lpd->xy, i, &xpt, &ypt);
01641         ptaGetPt(lpd->wh, i, &wpt, &hpt);
01642         maxx = L_MAX(maxx, xpt + wpt);
01643         maxy = L_MAX(maxy, ypt + hpt);
01644     }
01645 
01646     lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5),
01647                               (l_int32)(maxy + 0.5));
01648 
01649         /* ypt is in standard image coordinates: the location of
01650          * the UL image corner with respect to the UL media box corner.
01651          * Rewrite each ypt for PostScript coordinates: the location of
01652          * the LL image corner with respect to the LL media box corner. */
01653     for (i = 0; i < lpd->n; i++) {
01654         ptaGetPt(lpd->xy, i, &xpt, &ypt);
01655         ptaGetPt(lpd->wh, i, &wpt, &hpt);
01656         ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt);
01657     }
01658 
01659     return;
01660 }
01661 
01662 
01663 static l_int32
01664 generatePageStringPdf(L_PDF_DATA  *lpd)
01665 {
01666 char    *buf;
01667 char    *xstr;
01668 l_int32  bufsize, i, wpt, hpt;
01669 SARRAY  *sa;
01670     
01671     PROCNAME("generatePageStringPdf");
01672 
01673         /* Allocate 1000 bytes for the boilerplate text, and
01674          * 50 bytes for each reference to an image in the
01675          * ProcSet array.  */
01676     bufsize = 1000 + 50 * lpd->n;
01677     if ((buf = (char *)CALLOC(bufsize, sizeof(char))) == NULL)
01678         return ERROR_INT("calloc fail for buf", procName, 1);
01679 
01680     boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt);
01681     sa = sarrayCreate(lpd->n);
01682     for (i = 0; i < lpd->n; i++) {
01683         snprintf(buf, bufsize, "/Im%d %d 0 R   ", i + 1, 6 + i);
01684         sarrayAddString(sa, buf, L_COPY);
01685     }
01686     if ((xstr = sarrayToString(sa, 0)) == NULL)
01687         return ERROR_INT("xstr not found", procName, 1);
01688     sarrayDestroy(&sa);
01689         
01690     snprintf(buf, bufsize, "4 0 obj\n"
01691                            "<<\n"
01692                            "/Type /Page\n"
01693                            "/Parent 3 0 R\n"
01694                            "/MediaBox [%d %d %d %d]\n"
01695                            "/Contents 5 0 R\n"
01696                            "/Resources\n"
01697                            "<<\n"
01698                            "/XObject << %s >>\n"
01699                            "/ProcSet [ /ImageB /ImageI /ImageC ]\n"
01700                            ">>\n"
01701                            ">>\n"
01702                            "endobj\n",
01703                            0, 0, wpt, hpt, xstr);
01704 
01705     lpd->obj4 = stringNew(buf);
01706     numaAddNumber(lpd->objsize, strlen(lpd->obj4));
01707     sarrayDestroy(&sa);
01708     FREE(buf);
01709     FREE(xstr);
01710     return 0;
01711 }
01712 
01713 
01714 static l_int32
01715 generateContentStringPdf(L_PDF_DATA  *lpd)
01716 {
01717 char      *buf;
01718 char      *cstr;
01719 l_int32    i, bufsize;
01720 l_float32  xpt, ypt, wpt, hpt;
01721 SARRAY    *sa;
01722     
01723     PROCNAME("generateContentStringPdf");
01724 
01725     bufsize = 1000 + 200 * lpd->n;
01726     if ((buf = (char *)CALLOC(bufsize, sizeof(char))) == NULL)
01727         return ERROR_INT("calloc fail for buf", procName, 1);
01728 
01729     sa = sarrayCreate(lpd->n);
01730     for (i = 0; i < lpd->n; i++) {
01731         ptaGetPt(lpd->xy, i, &xpt, &ypt);
01732         ptaGetPt(lpd->wh, i, &wpt, &hpt);
01733         snprintf(buf, bufsize,
01734                  "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n",
01735                  wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1);
01736         sarrayAddString(sa, buf, L_COPY);
01737     }
01738     if ((cstr = sarrayToString(sa, 0)) == NULL)
01739         return ERROR_INT("cstr not found", procName, 1);
01740     sarrayDestroy(&sa);
01741         
01742     snprintf(buf, bufsize, "5 0 obj\n"
01743                            "<< /Length %d >>\n"
01744                            "stream\n"
01745                            "%s"
01746                            "endstream\n"
01747                            "endobj\n",
01748                            (l_int32)strlen(cstr), cstr);
01749 
01750     lpd->obj5 = stringNew(buf);
01751     numaAddNumber(lpd->objsize, strlen(lpd->obj5));
01752     sarrayDestroy(&sa);
01753     FREE(buf);
01754     FREE(cstr);
01755     return 0;
01756 }
01757 
01758 
01759 static l_int32
01760 generatePreXStringsPdf(L_PDF_DATA  *lpd)
01761 {
01762 char                buff[256];
01763 char                buf[L_BIGBUF];
01764 char               *cstr, *bstr, *fstr, *xstr;
01765 l_int32             i, cmindex;
01766 L_COMPRESSED_DATA  *cid;
01767 SARRAY             *sa;
01768     
01769     PROCNAME("generatePreXStringsPdf");
01770 
01771     sa = lpd->saprex;
01772     cmindex = 6 + lpd->n;  /* starting value */
01773     for (i = 0; i < lpd->n; i++) {
01774         if ((cid = pdfdataGetCid(lpd, i)) == NULL)
01775             return ERROR_INT("cid not found", procName, 1);
01776 
01777         if (cid->type == L_G4_ENCODE) {
01778             if (var_WRITE_G4_IMAGE_MASK) {
01779                 cstr = stringNew("/ImageMask true\n"
01780                                  "/ColorSpace /DeviceGray");
01781             }
01782             else
01783                 cstr = stringNew("/ColorSpace /DeviceGray");
01784             bstr = stringNew("/BitsPerComponent 1\n"
01785                              "/Interpolate true");
01786             snprintf(buff, sizeof(buff),
01787                      "/Filter /CCITTFaxDecode\n"
01788                      "/DecodeParms\n"
01789                      "<<\n"
01790                      "/K -1\n"
01791                      "/Columns %d\n"
01792                      ">>", cid->w);
01793             fstr = stringNew(buff);
01794         }
01795         else if (cid->type == L_JPEG_ENCODE) {
01796             if (cid->spp == 1)
01797                 cstr = stringNew("/ColorSpace /DeviceGray");
01798             else if (cid->spp == 3)
01799                 cstr = stringNew("/ColorSpace /DeviceRGB");
01800             else
01801                 L_ERROR("spp!= 1 && spp != 3", procName);
01802             bstr = stringNew("/BitsPerComponent 8");
01803             fstr = stringNew("/Filter /DCTDecode");
01804         }
01805         else {  /* type == L_FLATE_ENCODE */
01806             if (cid->ncolors > 0) {  /* cmapped */
01807                 snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++);
01808                 cstr = stringNew(buff);
01809             }
01810             else {
01811                 if (cid->spp == 1 && cid->bps == 1)
01812                     cstr = stringNew("/ColorSpace /DeviceGray\n"
01813                                      "/Decode [1 0]");
01814                 else if (cid->spp == 1)  /* 8 bpp */
01815                     cstr = stringNew("/ColorSpace /DeviceGray");
01816                 else if (cid->spp == 3)
01817                     cstr = stringNew("/ColorSpace /DeviceRGB");
01818                 else 
01819                     L_ERROR("unknown colorspace", procName);
01820             }
01821             snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps);
01822             bstr = stringNew(buff);
01823             fstr = stringNew("/Filter /FlateDecode");
01824         }
01825 
01826         snprintf(buf, sizeof(buf), 
01827                  "%d 0 obj\n"
01828                  "<<\n"
01829                  "/Length %ld\n"
01830                  "/Subtype /Image\n"
01831                  "%s\n"  /* colorspace */
01832                  "/Width %d\n"
01833                  "/Height %d\n"
01834                  "%s\n"  /* bits/component */
01835                  "%s\n"  /* filter */
01836                  ">>\n"
01837                  "stream\n",
01838                  6 + i, cid->nbytescomp, cstr, cid->w, cid->h, bstr, fstr);
01839         xstr = stringNew(buf);
01840         sarrayAddString(sa, xstr, L_INSERT);
01841         numaAddNumber(lpd->objsize,
01842                       strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream));
01843         FREE(cstr);
01844         FREE(bstr);
01845         FREE(fstr);
01846     }
01847 
01848     return 0;
01849 }
01850 
01851 
01852 static l_int32
01853 generateColormapStringsPdf(L_PDF_DATA  *lpd)
01854 {
01855 char                buf[L_BIGBUF];
01856 char               *cmstr;
01857 l_int32             i, cmindex, ncmap;
01858 L_COMPRESSED_DATA  *cid;
01859 SARRAY             *sa;
01860     
01861     PROCNAME("generateColormapStringsPdf");
01862 
01863         /* In our canonical format, we have 5 objects, followed
01864          * by n XObjects, followed by m colormaps, so the index of
01865          * the first colormap object is 6 + n. */
01866     sa = lpd->sacmap;
01867     cmindex = 6 + lpd->n;  /* starting value */
01868     ncmap = 0;
01869     for (i = 0; i < lpd->n; i++) {
01870         if ((cid = pdfdataGetCid(lpd, i)) == NULL)
01871             return ERROR_INT("cid not found", procName, 1);
01872         if (cid->ncolors == 0) continue;
01873 
01874         ncmap++;
01875         snprintf(buf, sizeof(buf), "%d 0 obj\n"
01876                                    "[ /Indexed /DeviceRGB\n"
01877                                    "%d\n"
01878                                    "%s\n"
01879                                    "]\n"
01880                                    "endobj\n",
01881                                    cmindex, cid->ncolors - 1, cid->cmapdatahex);
01882         cmindex++;
01883         cmstr = stringNew(buf);
01884         numaAddNumber(lpd->objsize, strlen(cmstr));
01885         sarrayAddString(sa, cmstr, L_INSERT);
01886     }
01887 
01888     lpd->ncmap = ncmap;
01889     return 0;
01890 }
01891 
01892 
01893 static void
01894 generateTrailerPdf(L_PDF_DATA  *lpd)
01895 {
01896 l_int32  i, n, size, linestart;
01897 NUMA    *naloc, *nasize;
01898 
01899         /* Let nobj be the number of numbered objects.  These numbered
01900          * objects are indexed by their pdf number in arrays naloc[]
01901          * and nasize[].  The 0th object is the 9 byte header.  Then
01902          * the number of objects in nasize, which includes the header,
01903          * is n = nobj + 1.  The array naloc[] has n + 1 elements,
01904          * because it includes as the last element the starting
01905          * location of xref.  The indexing of these objects, their
01906          * starting locations and sizes are:
01907          *
01908          *     Object number         Starting location         Size
01909          *     -------------         -----------------     --------------
01910          *          0                   naloc[0] = 0       nasize[0] = 9
01911          *          1                   naloc[1] = 9       nasize[1] = 49
01912          *          n                   naloc[n]           nasize[n] 
01913          *          xref                naloc[n+1] 
01914          *
01915          * We first generate naloc.
01916          */
01917     nasize = lpd->objsize;
01918     naloc = lpd->objloc;
01919     linestart = 0;
01920     numaAddNumber(naloc, linestart);  /* header */
01921     n = numaGetCount(nasize);
01922     for (i = 0; i < n; i++) {
01923         numaGetIValue(nasize, i, &size);
01924         linestart += size;
01925         numaAddNumber(naloc, linestart);
01926     }
01927     numaGetIValue(naloc, n, &lpd->xrefloc);  /* save it */
01928 
01929         /* Now make the actual trailer string */
01930     lpd->trailer = makeTrailerStringPdf(naloc);
01931 }
01932 
01933 
01934 static char *
01935 makeTrailerStringPdf(NUMA  *naloc)
01936 {
01937 char    *outstr;
01938 char     buf[L_BIGBUF];
01939 l_int32  i, n, linestart, xrefloc;
01940 SARRAY  *sa;
01941 
01942     PROCNAME("makeTrailerStringPdf");
01943 
01944     if (!naloc)
01945         return (char *)ERROR_PTR("naloc not defined", procName, NULL);
01946     n = numaGetCount(naloc) - 1;  /* numbered objects + 1 (yes, +1) */
01947 
01948     sa = sarrayCreate(0);
01949     snprintf(buf, sizeof(buf), "xref\n"
01950                                "0 %d\n"
01951                                "0000000000 65535 f \n", n);
01952     sarrayAddString(sa, (char *)buf, L_COPY);
01953     for (i = 1; i < n; i++) {
01954         numaGetIValue(naloc, i, &linestart);
01955         snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart);
01956         sarrayAddString(sa, (char *)buf, L_COPY);
01957     }
01958 
01959     numaGetIValue(naloc, n, &xrefloc);
01960     snprintf(buf, sizeof(buf), "trailer\n"
01961                                "<<\n"
01962                                "/Size %d\n"
01963                                "/Root 1 0 R\n"
01964                                "/Info 2 0 R\n"
01965                                ">>\n"
01966                                "startxref\n"
01967                                "%d\n"
01968                                "%%%%EOF\n", n, xrefloc);
01969     sarrayAddString(sa, (char *)buf, L_COPY);
01970     outstr = sarrayToString(sa, 0);
01971     sarrayDestroy(&sa);
01972     return outstr;
01973 }
01974 
01975 
01976 /*!
01977  *  generateOutputDataPdf()
01978  *
01979  *      Input:  &data (<return> pdf data array)
01980  *              &nbytes (<return> size of pdf data array)
01981  *              lpd (input data used to make pdf)
01982  *      Return: 0 if OK, 1 on error
01983  *
01984  *  Notes:
01985  *      (1) Only called from l_generatePdf().  On error, no data is returned.
01986  */
01987 static l_int32
01988 generateOutputDataPdf(l_uint8    **pdata,
01989                       size_t      *pnbytes,
01990                       L_PDF_DATA  *lpd)
01991 {
01992 char               *str;
01993 l_uint8            *data;
01994 l_int32             nimages, i, len;
01995 l_int32            *sizes, *locs;
01996 size_t              nbytes;
01997 L_COMPRESSED_DATA  *cid;
01998 
01999     PROCNAME("generateOutputDataPdf");
02000 
02001     if (!pdata)
02002         return ERROR_INT("&data not defined", procName, 1);
02003     *pdata = NULL;
02004     if (!pnbytes)
02005         return ERROR_INT("&nbytes not defined", procName, 1);
02006     nbytes = lpd->xrefloc + strlen(lpd->trailer);
02007     *pnbytes = nbytes;
02008     if ((data = (l_uint8 *)CALLOC(nbytes, sizeof(l_uint8))) == NULL)
02009         return ERROR_INT("calloc fail for data", procName, 1);
02010     *pdata = data;
02011 
02012     sizes = numaGetIArray(lpd->objsize);
02013     locs = numaGetIArray(lpd->objloc);
02014     memcpy((char *)data, lpd->id, sizes[0]);
02015     memcpy((char *)(data + locs[1]), lpd->obj1, sizes[1]);
02016     memcpy((char *)(data + locs[2]), lpd->obj2, sizes[2]);
02017     memcpy((char *)(data + locs[3]), lpd->obj3, sizes[3]);
02018     memcpy((char *)(data + locs[4]), lpd->obj4, sizes[4]);
02019     memcpy((char *)(data + locs[5]), lpd->obj5, sizes[5]);
02020 
02021         /* Each image has 3 parts: variable preamble, the compressed
02022          * data stream, and the fixed poststream. */
02023     nimages = lpd->n;
02024     for (i = 0; i < nimages; i++) {
02025         if ((cid = pdfdataGetCid(lpd, i)) == NULL)  /* this should not happen */
02026             return ERROR_INT("cid not found", procName, 1);
02027         str = sarrayGetString(lpd->saprex, i, L_NOCOPY);
02028         len = strlen(str);
02029         memcpy((char *)(data + locs[6 + i]), str, len);
02030         memcpy((char *)(data + locs[6 + i] + len),
02031                (char *)cid->datacomp, cid->nbytescomp);
02032         memcpy((char *)(data + locs[6 + i] + len + cid->nbytescomp),
02033                lpd->poststream, strlen(lpd->poststream));
02034     }
02035 
02036         /* Each colormap is simply a stored string */
02037     for (i = 0; i < lpd->ncmap; i++) {
02038         str = sarrayGetString(lpd->sacmap, i, L_NOCOPY);
02039         memcpy((char *)(data + locs[6 + nimages + i]), str, strlen(str));
02040     }
02041 
02042         /* And finally the trailer */
02043     memcpy((char *)(data + lpd->xrefloc), lpd->trailer, strlen(lpd->trailer));
02044     FREE(sizes);
02045     FREE(locs);
02046     return 0;
02047 }
02048 
02049 
02050 /*---------------------------------------------------------------------*
02051  *                         Multi-page concatenation                    *
02052  *---------------------------------------------------------------------*/
02053 /*!
02054  *  concatenatePdf()
02055  *
02056  *      Input:  directory name (containing single-page pdf files)
02057  *              substr (<optional> substring filter on filenames; can be NULL)
02058  *              fileout (concatenated pdf file)
02059  *      Return: 0 if OK, 1 on error
02060  *
02061  *  Notes:
02062  *      (1) This only works with leptonica-formatted single-page pdf files.
02063  *      (2) If @substr is not NULL, only filenames that contain
02064  *          the substring can be returned.  If @substr == NULL,
02065  *          none of the filenames are filtered out.
02066  *      (3) The files in the directory, after optional filtering by
02067  *          the substring, are lexically sorted in increasing order
02068  *          before concatenation.
02069  */
02070 l_int32
02071 concatenatePdf(const char  *dirname,
02072                const char  *substr,
02073                const char  *fileout)
02074 {
02075 l_int32  ret;
02076 SARRAY  *sa;
02077 
02078     PROCNAME("concatenatePdf");
02079 
02080     if (!dirname)
02081         return ERROR_INT("dirname not defined", procName, 1);
02082     if (!fileout)
02083         return ERROR_INT("fileout not defined", procName, 1);
02084 
02085     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
02086         return ERROR_INT("sa not made", procName, 1);
02087     ret = saConcatenatePdf(sa, fileout);
02088     sarrayDestroy(&sa);
02089     return ret;
02090 }
02091 
02092 
02093 /*!
02094  *  saConcatenatePdf()
02095  *
02096  *      Input:  sarray (of pathnames for single-page pdf files)
02097  *              fileout (concatenated pdf file)
02098  *      Return: 0 if OK, 1 on error
02099  *
02100  *  Notes:
02101  *      (1) This only works with leptonica-formatted single-page pdf files.
02102  */
02103 l_int32
02104 saConcatenatePdf(SARRAY      *sa,
02105                  const char  *fileout)
02106 {
02107 l_uint8  *data;
02108 l_int32   ret;
02109 size_t    nbytes;
02110 
02111     PROCNAME("saConcatenatePdf");
02112 
02113     if (!sa)
02114         return ERROR_INT("sa not defined", procName, 1);
02115     if (!fileout)
02116         return ERROR_INT("fileout not defined", procName, 1);
02117 
02118     ret = saConcatenatePdfToData(sa, &data, &nbytes);
02119     if (ret)
02120         return ERROR_INT("pdf data not made", procName, 1);
02121     ret = l_binaryWrite(fileout, "w", data, nbytes);
02122     FREE(data);
02123     return ret;
02124 }
02125 
02126 
02127 /*!
02128  *  ptraConcatenatePdf()
02129  *
02130  *      Input:  ptra (array of pdf strings, each for a single-page pdf file)
02131  *              fileout (concatenated pdf file)
02132  *      Return: 0 if OK, 1 on error
02133  *
02134  *  Notes:
02135  *      (1) This only works with leptonica-formatted single-page pdf files.
02136  */
02137 l_int32
02138 ptraConcatenatePdf(L_PTRA      *pa,
02139                    const char  *fileout)
02140 {
02141 l_uint8  *data;
02142 l_int32   ret;
02143 size_t    nbytes;
02144 
02145     PROCNAME("ptraConcatenatePdf");
02146 
02147     if (!pa)
02148         return ERROR_INT("pa not defined", procName, 1);
02149     if (!fileout)
02150         return ERROR_INT("fileout not defined", procName, 1);
02151 
02152     ret = ptraConcatenatePdfToData(pa, NULL, &data, &nbytes);
02153     if (ret)
02154         return ERROR_INT("pdf data not made", procName, 1);
02155     ret = l_binaryWrite(fileout, "w", data, nbytes);
02156     FREE(data);
02157     return ret;
02158 }
02159 
02160 
02161 /*!
02162  *  concatenatePdfToData()
02163  *
02164  *      Input:  directory name (containing single-page pdf files)
02165  *              substr (<optional> substring filter on filenames; can be NULL)
02166  *              &data (<return> concatenated pdf data in memory)
02167  *              &nbytes (<return> number of bytes in pdf data)
02168  *      Return: 0 if OK, 1 on error
02169  *
02170  *  Notes:
02171  *      (1) This only works with leptonica-formatted single-page pdf files.
02172  *      (2) If @substr is not NULL, only filenames that contain
02173  *          the substring can be returned.  If @substr == NULL,
02174  *          none of the filenames are filtered out.
02175  *      (3) The files in the directory, after optional filtering by
02176  *          the substring, are lexically sorted in increasing order
02177  *          before concatenation.
02178  */
02179 l_int32
02180 concatenatePdfToData(const char  *dirname,
02181                      const char  *substr,
02182                      l_uint8    **pdata,
02183                      size_t      *pnbytes)
02184 {
02185 l_int32  ret;
02186 SARRAY  *sa;
02187 
02188     PROCNAME("concatenatePdfToData");
02189 
02190     if (!pdata)
02191         return ERROR_INT("&data not defined", procName, 1);
02192     *pdata = NULL;
02193     if (!pnbytes)
02194         return ERROR_INT("&nbytes not defined", procName, 1);
02195     *pnbytes = 0;
02196     if (!dirname)
02197         return ERROR_INT("dirname not defined", procName, 1);
02198 
02199     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
02200         return ERROR_INT("sa not made", procName, 1);
02201     ret = saConcatenatePdfToData(sa, pdata, pnbytes);
02202     sarrayDestroy(&sa);
02203     return ret;
02204 }
02205 
02206 
02207 /*!
02208  *  saConcatenatePdfToData()
02209  *
02210  *      Input:  sarray (of pathnames for single-page pdf files)
02211  *              &data (<return> concatenated pdf data in memory)
02212  *              &nbytes (<return> number of bytes in pdf data)
02213  *      Return: 0 if OK, 1 on error
02214  *
02215  *  Notes:
02216  *      (1) This only works with leptonica-formatted single-page pdf files.
02217  */
02218 l_int32
02219 saConcatenatePdfToData(SARRAY    *sa,
02220                        l_uint8  **pdata,
02221                        size_t    *pnbytes)
02222 {
02223 char     *fname;
02224 l_int32   i, npages, ret;
02225 L_BYTEA  *bas;
02226 L_PTRA   *pa_data;  /* input pdf data for each page */
02227 
02228     PROCNAME("saConcatenatePdfToData");
02229 
02230     if (!pdata)
02231         return ERROR_INT("&data not defined", procName, 1);
02232     *pdata = NULL;
02233     if (!pnbytes)
02234         return ERROR_INT("&nbytes not defined", procName, 1);
02235     *pnbytes = 0;
02236     if (!sa)
02237         return ERROR_INT("sa not defined", procName, 1);
02238 
02239         /* Read the pdf files into memory */
02240     if ((npages = sarrayGetCount(sa)) == 0)
02241         return ERROR_INT("no filenames found", procName, 1);
02242     pa_data = ptraCreate(npages);
02243     for (i = 0; i < npages; i++) {
02244         fname = sarrayGetString(sa, i, L_NOCOPY);
02245         bas = l_byteaInitFromFile(fname);
02246         ptraAdd(pa_data, bas);
02247     }
02248 
02249     ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes);
02250 
02251         /* Cleanup: some pages could have been removed */
02252     ptraGetActualCount(pa_data, &npages);
02253     for (i = 0; i < npages; i++) {
02254         bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
02255         l_byteaDestroy(&bas);
02256     }
02257     ptraDestroy(&pa_data, FALSE, FALSE);
02258     return ret;
02259 }
02260 
02261 
02262 /*!
02263  *  ptraConcatenatePdfToData()
02264  *
02265  *      Input:  ptra (array of pdf strings, each for a single-page pdf file)
02266  *              sarray (<optional> of pathnames for input pdf files)
02267  *              &data (<return> concatenated pdf data in memory)
02268  *              &nbytes (<return> number of bytes in pdf data)
02269  *      Return: 0 if OK, 1 on error
02270  *
02271  *  Notes:
02272  *      (1) This only works with leptonica-formatted single-page pdf files.
02273  *          pdf files generated by other programs will have unpredictable
02274  *          (and usually bad) results.  The requirements for each pdf file:
02275  *            (a) The Catalog and Info objects are the first two.
02276  *            (b) Object 3 is Pages
02277  *            (c) Object 4 is Page
02278  *            (d) The remaining objects are Contents, XObjects, and ColorSpace
02279  *      (2) We remove trailers from each page, and append the full trailer
02280  *          for all pages at the end.
02281  *      (3) For all but the first file, remove the ID and the first 3
02282  *          objects (catalog, info, pages), so that each subsequent
02283  *          file has only objects of these classes:
02284  *              Page, Contents, XObject, ColorSpace (Indexed RGB).
02285  *          For those objects, we substitute these refs to objects
02286  *          in the local file:
02287  *              Page:  Parent(object 3), Contents, XObject(typically multiple)
02288  *              XObject:  [ColorSpace if indexed]
02289  *          The Pages object on the first page (object 3) has a Kids array
02290  *          of references to all the Page objects, with a Count equal
02291  *          to the number of pages.  Each Page object refers back to
02292  *          this parent.
02293  */
02294 l_int32
02295 ptraConcatenatePdfToData(L_PTRA    *pa_data,
02296                          SARRAY    *sa,
02297                          l_uint8  **pdata,
02298                          size_t    *pnbytes)
02299 {
02300 char     *fname, *str_pages, *str_trailer;
02301 l_uint8  *pdfdata, *data;
02302 l_int32   i, j, index, nobj, npages;
02303 l_int32  *sizes, *locs;
02304 size_t    size;
02305 L_BYTEA  *bas, *bad, *bat1, *bat2;
02306 NUMA     *na, *na_locs, *na_objs, *napage, *na_sizes, *na_outlocs;
02307 NUMAA    *naa_locs;  /* object locations on each page */
02308 NUMAA    *naa_objs;  /* object mapping numbers to new values */
02309 
02310     PROCNAME("ptraConcatenatePdfToData");
02311 
02312     if (!pdata)
02313         return ERROR_INT("&data not defined", procName, 1);
02314     *pdata = NULL;
02315     if (!pnbytes)
02316         return ERROR_INT("&nbytes not defined", procName, 1);
02317     *pnbytes = 0;
02318     if (!pa_data)
02319         return ERROR_INT("pa_data not defined", procName, 1);
02320 
02321         /* Parse the files and find the object locations.
02322          * Remove file data that cannot be parsed. */
02323     ptraGetActualCount(pa_data, &npages);
02324     naa_locs = numaaCreate(npages);
02325     for (i = 0; i < npages; i++) {
02326         bas = (L_BYTEA *)ptraGetHandle(pa_data, i);
02327         if (parseTrailerPdf(bas, &na_locs) != 0) {
02328             bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
02329             l_byteaDestroy(&bas);
02330             if (sa) {
02331                 fname = sarrayGetString(sa, i, L_NOCOPY);
02332                 L_ERROR_STRING("can't parse file %s; skipping",
02333                                procName, fname);
02334             }
02335             else {
02336                 L_ERROR_INT("can't parse file %d; skipping", procName, i);
02337             }
02338         }
02339         else {
02340             numaaAddNuma(naa_locs, na_locs, L_INSERT);
02341         }
02342     }
02343 
02344         /* Recompute npages in case some of the files were not pdf */
02345     ptraCompactArray(pa_data);
02346     ptraGetActualCount(pa_data, &npages);
02347     if (npages == 0) {
02348         numaaDestroy(&naa_locs);
02349         return ERROR_INT("no parsable pdf files found", procName, 1);
02350     }
02351 
02352         /* Find the mapping from initial to final object numbers */
02353     naa_objs = numaaCreate(npages);  /* stores final object numbers */
02354     napage = numaCreate(npages);  /* stores "Page" object numbers */
02355     index = 0;
02356     for (i = 0; i < npages; i++) {
02357         na = numaaGetNuma(naa_locs, i, L_CLONE);
02358         nobj = numaGetCount(na);
02359         if (i == 0) {
02360             numaAddNumber(napage, 4);  /* object 4 on first page */
02361             na_objs = numaMakeSequence(0.0, 1.0, nobj - 1);
02362             index = nobj - 1;
02363         }
02364         else {  /* skip the first 3 objects in each file */
02365             numaAddNumber(napage, index);  /* Page object is first we add */
02366             na_objs = numaMakeConstant(0.0, nobj - 1);
02367             numaReplaceNumber(na_objs, 3, 3);  /* refers to parent of all */
02368             for (j = 4; j < nobj - 1; j++)
02369                 numaSetValue(na_objs, j, index++);
02370         }
02371         numaaAddNuma(naa_objs, na_objs, L_INSERT);
02372         numaDestroy(&na);
02373     }
02374 
02375         /* Make the Pages object (#3) */
02376     str_pages = generatePagesObjStringPdf(napage);
02377 
02378         /* Build the output */
02379     bad = l_byteaCreate(5000);
02380     na_outlocs = numaCreate(0);  /* locations of all output objects */
02381     for (i = 0; i < npages; i++) {
02382         bas = (L_BYTEA *)ptraGetHandle(pa_data, i);
02383         pdfdata = l_byteaGetData(bas, &size);
02384         na_locs = numaaGetNuma(naa_locs, i, L_CLONE);  /* locs on this page */
02385         na_objs = numaaGetNuma(naa_objs, i, L_CLONE);  /* obj # on this page */
02386         nobj = numaGetCount(na_locs) - 1;
02387         na_sizes = numaMakeDelta(na_locs);  /* object sizes on this page */
02388         sizes = numaGetIArray(na_sizes);
02389         locs = numaGetIArray(na_locs);
02390         if (i == 0) {
02391             l_byteaAppendData(bad, pdfdata, sizes[0]);
02392             l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]);
02393             l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]);
02394             l_byteaAppendString(bad, str_pages);
02395             for (j = 0; j < 4; j++)
02396                 numaAddNumber(na_outlocs, locs[j]);
02397         }
02398         for (j = 4; j < nobj; j++) {
02399             numaAddNumber(na_outlocs, l_byteaGetSize(bad));
02400             bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]);
02401             bat2 = substituteObjectNumbers(bat1, na_objs);
02402             data = l_byteaGetData(bat2, &size);
02403             l_byteaAppendData(bad, data, size);
02404             l_byteaDestroy(&bat1);
02405             l_byteaDestroy(&bat2);
02406         }
02407         if (i == npages - 1)  /* last one */
02408             numaAddNumber(na_outlocs, l_byteaGetSize(bad));
02409         FREE(sizes);
02410         FREE(locs);
02411         numaDestroy(&na_locs);
02412         numaDestroy(&na_objs);
02413         numaDestroy(&na_sizes);
02414     }
02415 
02416         /* Add the trailer */
02417     str_trailer = makeTrailerStringPdf(na_outlocs);
02418     l_byteaAppendString(bad, str_trailer);
02419 
02420         /* Transfer the output data */
02421     *pdata = l_byteaCopyData(bad, pnbytes);
02422     l_byteaDestroy(&bad);
02423 
02424 #if  DEBUG_MULTIPAGE
02425     fprintf(stderr, "******** object mapper **********");
02426     numaaWriteStream(stderr, naa_objs);
02427 
02428     fprintf(stderr, "******** Page object numbers ***********");
02429     numaWriteStream(stderr, napage);
02430 
02431     fprintf(stderr, "******** Pages object ***********\n");
02432     fprintf(stderr, "%s\n", str_pages);
02433 #endif  /* DEBUG_MULTIPAGE */
02434 
02435     numaaDestroy(&naa_locs);
02436     numaaDestroy(&naa_objs);
02437     numaDestroy(&napage);
02438     numaDestroy(&na_outlocs);
02439     FREE(str_pages);
02440     FREE(str_trailer);
02441     return 0;
02442 }
02443 
02444 
02445 /*---------------------------------------------------------------------*
02446  *       Helper functions for generating the multi-page pdf output      *
02447  *---------------------------------------------------------------------*/
02448 /*!
02449  *  parseTrailerPdf()
02450  *
02451  *  Input:  bas (lba of a pdf file)
02452  *          na (<return> byte locations of the beginning of each object)
02453  *  Return: 0 if OK, 1 on error
02454  */
02455 static l_int32
02456 parseTrailerPdf(L_BYTEA  *bas,
02457                 NUMA    **pna)
02458 {
02459 char     *str;
02460 l_uint8   nl = '\n';
02461 l_uint8  *data;
02462 l_int32   i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok;
02463 size_t    size;
02464 NUMA     *na, *naobj, *naxref;
02465 SARRAY   *sa;
02466 
02467     PROCNAME("parseTrailerPdf");
02468 
02469     if (!pna)
02470         return ERROR_INT("&na not defined", procName, 1);
02471     *pna = NULL;
02472     if (!bas)
02473         return ERROR_INT("bas not defined", procName, 1);
02474     data = l_byteaGetData(bas, &size);
02475     if (strncmp((char *)data, "%PDF-1.", 7) != 0)
02476         return ERROR_INT("PDF header signature not found", procName, 1);
02477 
02478         /* Search for "startxref" starting 50 bytes from the EOF */
02479     start = 0;
02480     if (size > 50)
02481         start = size - 50;
02482     arrayFindSequence(data + start, size - start,
02483                       (l_uint8 *)"startxref\n", 10, &loc, &found);
02484     if (!found)
02485         return ERROR_INT("startxref not found!", procName, 1);
02486     if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1)
02487         return ERROR_INT("xrefloc not found!", procName, 1);
02488     if (xrefloc < 0 || xrefloc >= size)
02489         return ERROR_INT("invalid xrefloc!", procName, 1);
02490     sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0);
02491     str = sarrayGetString(sa, 1, L_NOCOPY);
02492     if ((sscanf(str, "0 %d", &nobj)) != 1)
02493         return ERROR_INT("nobj not found", procName, 1);
02494 
02495         /* Get starting locations.  The numa index is the
02496          * object number.  loc[0] is the ID; loc[nobj + 1] is xrefloc.  */
02497     na = numaCreate(nobj + 1);
02498     *pna = na;
02499     for (i = 0; i < nobj; i++) {
02500         str = sarrayGetString(sa, i + 2, L_NOCOPY);
02501         sscanf(str, "%d", &startloc);
02502         numaAddNumber(na, startloc);
02503     }
02504     numaAddNumber(na, xrefloc);
02505 
02506 #if  DEBUG_MULTIPAGE
02507     fprintf(stderr, "************** Trailer string ************\n");
02508     fprintf(stderr, "xrefloc = %d", xrefloc);
02509     sarrayWriteStream(stderr, sa);
02510 
02511     fprintf(stderr, "************** Object locations ************");
02512     numaWriteStream(stderr, na);
02513 #endif  /* DEBUG_MULTIPAGE */
02514     sarrayDestroy(&sa);
02515 
02516         /* Verify correct parsing */
02517     trailer_ok = TRUE;
02518     for (i = 1; i < nobj; i++) {
02519         numaGetIValue(na, i, &startloc);
02520         if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) {
02521             L_ERROR_INT("bad trailer for object %d", procName, i);
02522             trailer_ok = FALSE;
02523             break;
02524         }
02525     }
02526 
02527         /* If the trailer is broken, reconstruct the correct obj locations */
02528     if (!trailer_ok) {
02529         L_INFO("rebuilding pdf trailer", procName);
02530         numaEmpty(na);
02531         numaAddNumber(na, 0);
02532         l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &naobj);
02533         nobj = numaGetCount(naobj);
02534         for (i = 0; i < nobj; i++) {
02535             numaGetIValue(naobj, i, &loc);
02536             for (j = loc - 1; j > 0; j--) {
02537                 if (data[j] == nl)
02538                     break;
02539             }
02540             numaAddNumber(na, j + 1);
02541         }
02542         l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &naxref);
02543         numaGetIValue(naxref, 0, &loc);
02544         numaAddNumber(na, loc);
02545         numaDestroy(&naobj);
02546         numaDestroy(&naxref);
02547     }
02548 
02549     return 0;
02550 }
02551 
02552 
02553 static char *
02554 generatePagesObjStringPdf(NUMA  *napage)
02555 {
02556 char    *str, *outstr;
02557 char    *buf;
02558 l_int32  i, n, index, bufsize;
02559 SARRAY  *sa;
02560 
02561     PROCNAME("generatePagesObjStringPdf");
02562 
02563     if (!napage)
02564         return (char *)ERROR_PTR("napage not defined", procName, NULL);
02565 
02566     n = numaGetCount(napage);
02567     bufsize = 100 + 16 * n;
02568     buf = (char *)CALLOC(bufsize, sizeof(char));
02569     sa = sarrayCreate(n);
02570     for (i = 0; i < n; i++) {
02571         numaGetIValue(napage, i, &index);
02572         snprintf(buf, bufsize, " %d 0 R ", index);
02573         sarrayAddString(sa, buf, L_COPY);
02574     }
02575 
02576     str = sarrayToString(sa, 0);
02577     snprintf(buf, bufsize, "3 0 obj\n"
02578                            "<<\n"
02579                            "/Type /Pages\n"
02580                            "/Kids [%s]\n"
02581                            "/Count %d\n"
02582                            ">>\n", str, n);
02583     outstr = stringNew(buf);
02584     sarrayDestroy(&sa);
02585     FREE(str);
02586     FREE(buf);
02587     return outstr;
02588 }
02589 
02590 
02591 /*!
02592  *  substituteObjectNumbers()
02593  *
02594  *  Input:  bas (lba of a pdf object)
02595  *          na_objs (object number mapping array)
02596  *  Return: bad (lba of rewritten pdf for the object)
02597  *
02598  *  Notes:
02599  *      (1) Interpret the first set of bytes as the object number,
02600  *          map to the new number, and write it out.
02601  *      (2) Find all occurrences of this 4-byte sequence: " 0 R"
02602  *      (3) Find the location and value of the integer preceeding this,
02603  *          and map it to the new value.
02604  *      (4) Rewrite the object with new object numbers.
02605  */
02606 static L_BYTEA *
02607 substituteObjectNumbers(L_BYTEA  *bas,
02608                         NUMA     *na_objs)
02609 {
02610 l_uint8   space = ' ';
02611 l_uint8  *datas;
02612 l_uint8   buf[32];  /* only needs to hold one integer in ascii format */
02613 l_int32   start, nrepl, i, j, objin, objout;
02614 l_int32  *objs, *matches;
02615 size_t    size;
02616 L_BYTEA  *bad;
02617 NUMA     *na_match;
02618 
02619     datas = l_byteaGetData(bas, &size);
02620     bad = l_byteaCreate(100);
02621     objs = numaGetIArray(na_objs);  /* object number mapper */
02622 
02623         /* Substitute the object number on the first line */
02624     sscanf((char *)datas, "%d", &objin);
02625     objout = objs[objin];
02626     snprintf((char *)buf, 32, "%d", objout);
02627     l_byteaAppendString(bad, (char *)buf);
02628 
02629         /* Find the set of matching locations for object references */
02630     arrayFindSequence(datas, size, &space, 1, &start, NULL);
02631     na_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4);
02632     if (!na_match) {
02633         l_byteaAppendData(bad, datas + start, size - start);
02634         FREE(objs);
02635         return bad;
02636     }
02637 
02638         /* Substitute all the object reference numbers */
02639     nrepl = numaGetCount(na_match);
02640     matches = numaGetIArray(na_match);
02641     for (i = 0; i < nrepl; i++) {
02642             /* Find the first space before the object number */
02643         for (j = matches[i] - 1; j > 0; j--) {
02644             if (datas[j] == space)
02645                 break;
02646         }
02647             /* Copy bytes from 'start' up to the object number */
02648         l_byteaAppendData(bad, datas + start, j - start + 1);
02649         sscanf((char *)(datas + j + 1), "%d", &objin);
02650         objout = objs[objin];
02651         snprintf((char *)buf, 32, "%d", objout);
02652         l_byteaAppendString(bad, (char *)buf);
02653         start = matches[i];
02654     }
02655     l_byteaAppendData(bad, datas + start, size - start);
02656 
02657     FREE(objs);
02658     FREE(matches);
02659     numaDestroy(&na_match);
02660     return bad;
02661 }
02662 
02663 
02664 /*---------------------------------------------------------------------*
02665  *                     Create/destroy/access pdf data                  *
02666  *---------------------------------------------------------------------*/
02667 static L_PDF_DATA *
02668 pdfdataCreate(const char  *title)
02669 {
02670 L_PDF_DATA *lpd;
02671 
02672     lpd = (L_PDF_DATA *)CALLOC(1, sizeof(L_PDF_DATA));
02673     if (title) lpd->title = stringNew(title);
02674     lpd->cida = ptraCreate(10);
02675     lpd->xy = ptaCreate(10);
02676     lpd->wh = ptaCreate(10);
02677     lpd->saprex = sarrayCreate(10);
02678     lpd->sacmap = sarrayCreate(10);
02679     lpd->objsize = numaCreate(20);
02680     lpd->objloc = numaCreate(20);
02681     return lpd;
02682 }
02683 
02684 static void
02685 pdfdataDestroy(L_PDF_DATA  **plpd)
02686 {
02687 l_int32             i;
02688 L_COMPRESSED_DATA  *cid;
02689 L_PDF_DATA         *lpd;
02690 
02691     PROCNAME("pdfdataDestroy");
02692 
02693     if (plpd== NULL) {
02694         L_WARNING("ptr address is null!", procName);
02695         return;
02696     }
02697     if ((lpd = *plpd) == NULL)
02698         return;
02699 
02700     if (lpd->title) FREE(lpd->title);
02701     for (i = 0; i < lpd->n; i++) {
02702         cid = (L_COMPRESSED_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION);
02703         compressed_dataDestroy(&cid);
02704     }
02705 
02706     ptraDestroy(&lpd->cida, 0, 0);
02707     if (lpd->id) FREE(lpd->id);
02708     if (lpd->obj1) FREE(lpd->obj1);
02709     if (lpd->obj2) FREE(lpd->obj2);
02710     if (lpd->obj3) FREE(lpd->obj3);
02711     if (lpd->obj4) FREE(lpd->obj4);
02712     if (lpd->obj5) FREE(lpd->obj5);
02713     if (lpd->poststream) FREE(lpd->poststream);
02714     if (lpd->trailer) FREE(lpd->trailer);
02715     if (lpd->xy) ptaDestroy(&lpd->xy);
02716     if (lpd->wh) ptaDestroy(&lpd->wh);
02717     if (lpd->mediabox) boxDestroy(&lpd->mediabox);
02718     if (lpd->saprex) sarrayDestroy(&lpd->saprex);
02719     if (lpd->sacmap) sarrayDestroy(&lpd->sacmap);
02720     if (lpd->objsize) numaDestroy(&lpd->objsize);
02721     if (lpd->objloc) numaDestroy(&lpd->objloc);
02722     FREE(lpd);
02723     *plpd = NULL;
02724     return;
02725 }
02726 
02727 
02728 static L_COMPRESSED_DATA *
02729 pdfdataGetCid(L_PDF_DATA  *lpd,
02730               l_int32      index)
02731 {
02732     PROCNAME("pdfdataGetCid");
02733 
02734     if (!lpd)
02735         return (L_COMPRESSED_DATA *)ERROR_PTR("lpd not defined",
02736                                               procName, NULL);
02737     if (index < 0 || index >= lpd->n)
02738         return (L_COMPRESSED_DATA *)ERROR_PTR("invalid image index",
02739                                               procName, NULL);
02740 
02741     return (L_COMPRESSED_DATA *)ptraGetHandle(lpd->cida, index);
02742 }
02743 
02744 
02745 /*---------------------------------------------------------------------*
02746  *                       Set flags for special modes                   *
02747  *---------------------------------------------------------------------*/
02748 /*!
02749  *  l_pdfSetG4ImageMask()
02750  *
02751  *      Input:  flag (1 for writing g4 data as fg only through a mask;
02752  *                    0 for writing fg and bg)
02753  *      Return: void
02754  *
02755  *  Notes:
02756  *      (1) The default is for writing only the fg (through the mask).
02757  *          That way when you write a 1 bpp image, the bg is transparent,
02758  *          so any previously written image remains visible behind it.
02759  */
02760 void
02761 l_pdfSetG4ImageMask(l_int32  flag)
02762 {
02763     var_WRITE_G4_IMAGE_MASK = flag;
02764 }
02765 
02766 
02767 /*!
02768  *  l_pdfSetDateAndVersion()
02769  *
02770  *      Input:  flag (1 for writing date/time and leptonica version;
02771  *                    0 for omitting this from the metadata)
02772  *      Return: void
02773  *
02774  *  Notes:
02775  *      (1) The default is for writing this data.  For regression tests
02776  *          that compare output against golden files, it is useful to omit.
02777  */
02778 void
02779 l_pdfSetDateAndVersion(l_int32  flag)
02780 {
02781     var_WRITE_DATE_AND_VERSION = flag;
02782 }
02783 
02784 
02785 /* --------------------------------------------*/
02786 #endif  /* USE_PDFIO */
02787 /* --------------------------------------------*/
02788 
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines