Leptonica 1.68
C Image Processing Library
|
00001 /*====================================================================* 00002 - Copyright (C) 2001 Leptonica. All rights reserved. 00003 - This software is distributed in the hope that it will be 00004 - useful, but with NO WARRANTY OF ANY KIND. 00005 - No author or distributor accepts responsibility to anyone for the 00006 - consequences of using this software, or for whether it serves any 00007 - particular purpose or works at all, unless he or she says so in 00008 - writing. Everyone is granted permission to copy, modify and 00009 - redistribute this source code, for commercial or non-commercial 00010 - purposes, with the following restrictions: (1) the origin of this 00011 - source code must not be misrepresented; (2) modified versions must 00012 - be plainly marked as such; and (3) this notice may not be removed 00013 - or altered from any source or modified source distribution. 00014 *====================================================================*/ 00015 00016 /* 00017 * pdfio.c 00018 * 00019 * |=============================================================| 00020 * | Important note | 00021 * |=============================================================| 00022 * | Some of these functions require libtiff, libjpeg, and libz | 00023 * | If you do not have these libraries, you must set | 00024 * | #define USE_PDFIO 0 | 00025 * | in environ.h. This will link pdfiostub.c | 00026 * |=============================================================| 00027 * 00028 * The first set of functions converts a set of images to a multi-page 00029 * pdf file, with one image on each page. All images are rendered 00030 * at the same (input) resolution. The images can be specified as 00031 * being in a directory, or they can be in an sarray. The output 00032 * pdf can be either a file or an array of bytes in memory. 
00033 * 00034 * The second set of functions implements a pdf output "device driver" 00035 * for wrapping (encoding) any number of images on a single page 00036 * in pdf. The images can be rendered using a pdf viewer, 00037 * such as gv, evince, xpdf or acroread. 00038 * See: http://www.adobe.com/devnet/pdf/pdf_reference_archive.html 00039 * 00040 * The third set of functions (segmented) takes an image, an 00041 * optional binary mask, an encoding flag, and some other parameters, 00042 * and generates a single-page mixed raster pdf. 00043 * 00044 * The fourth set of functions (concatenated) takes a set of single-page 00045 * pdf files and concatenates them into a multi-page pdf 00046 * 00047 * 1. Convert specified image files to Pdf (one image file per page) 00048 * l_int32 convertFilesToPdf() 00049 * l_int32 saConvertFilesToPdf() 00050 * l_int32 saConvertFilesToPdfData() 00051 * l_int32 selectDefaultPdfEncoding() 00052 * 00053 * 2. Single page, multi-image converters 00054 * l_int32 convertToPdf() 00055 * l_int32 convertImageDataToPdf() 00056 * l_int32 convertToPdfData() 00057 * l_int32 convertImageDataToPdfData() 00058 * l_int32 pixConvertToPdf() 00059 * l_int32 pixConvertToPdfData() 00060 * l_int32 pixWriteStreamPdf() 00061 * 00062 * 3. Segmented multi-page, multi-image converter 00063 * l_int32 convertSegmentedFilesToPdf() 00064 * 00065 * 4. 
Segmented single page, multi-image converters 00066 * l_int32 convertToPdfSegmented() 00067 * l_int32 pixConvertToPdfSegmented() 00068 * l_int32 convertToPdfDataSegmented() 00069 * l_int32 pixConvertToPdfDataSegmented() 00070 * 00071 * Helper functions for generating the output pdf string 00072 * static l_int32 l_generatePdf() 00073 * static void generateFixedStringsPdf() 00074 * static void generateMediaboxPdf() 00075 * static l_int32 generatePageStringPdf() 00076 * static l_int32 generateContentStringPdf() 00077 * static l_int32 generatePreXStringsPdf() 00078 * static l_int32 generateColormapStringsPdf() 00079 * static void generateTrailerPdf() 00080 * static l_int32 makeTrailerStringPdf() 00081 * static l_int32 generateOutputDataPdf() 00082 * 00083 * 5. Multi-page concatenation 00084 * l_int32 concatenatePdf() 00085 * l_int32 saConcatenatePdf() 00086 * l_int32 ptraConcatenatePdf() 00087 * l_int32 concatenatePdfToData() 00088 * l_int32 saConcatenatePdfToData() 00089 * l_int32 ptraConcatenatePdfToData() 00090 * 00091 * Helper functions for generating the multi-page pdf output 00092 * static l_int32 parseTrailerPdf() 00093 * static char *generatePagesObjStringPdf() 00094 * static L_BYTEA *substituteObjectNumbers() 00095 * 00096 * Create/destroy/access pdf data 00097 * static L_PDF_DATA *pdfdataCreate() 00098 * static void pdfdataDestroy() 00099 * static L_COMPRESSED_DATA *pdfdataGetCid() 00100 * 00101 * Set flags for special modes 00102 * void l_pdfSetG4ImageMask() 00103 * void l_pdfSetDateAndVersion() 00104 * 00105 * The top-level multi-image functions can be visualized as follows: 00106 * Output pdf data to file: 00107 * convertToPdf() and convertImageDataToPdf() 00108 * --> pixConvertToPdf() 00109 * --> pixConvertToPdfData() 00110 * 00111 * Output pdf data to array in memory: 00112 * convertToPdfData() and convertImageDataToPdfData() 00113 * --> pixConvertToPdfData() 00114 * 00115 * The top-level segmented image functions can be visualized as follows: 00116 * 
Output pdf data to file: 00117 * convertToPdfSegmented() 00118 * --> pixConvertToPdfSegmented() 00119 * --> pixConvertToPdfDataSegmented() 00120 * 00121 * Output pdf data to array in memory: 00122 * convertToPdfDataSegmented() 00123 * --> pixConvertToPdfDataSegmented() 00124 * 00125 * For multi-page concatenation, there are three different types of input 00126 * (1) directory and optional filename filter 00127 * (2) sarray of filenames 00128 * (3) ptra of byte arrays of pdf data 00129 * and two types of output for the concatenated pdf data 00130 * (1) filename 00131 * (2) data array and size 00132 * High-level interfaces are given for each of the six combinations. 00133 */ 00134 00135 #include <string.h> 00136 #include <math.h> 00137 #include "allheaders.h" 00138 00139 /* --------------------------------------------*/ 00140 #if USE_PDFIO /* defined in environ.h */ 00141 /* --------------------------------------------*/ 00142 00143 /* Typical scan resolution in ppi (pixels/inch) */ 00144 static const l_int32 DEFAULT_INPUT_RES = 300; 00145 00146 /* Static helpers */ 00147 static l_int32 l_generatePdf(l_uint8 **pdata, size_t *pnbytes, 00148 L_PDF_DATA *lpd); 00149 static void generateFixedStringsPdf(L_PDF_DATA *lpd); 00150 static void generateMediaboxPdf(L_PDF_DATA *lpd); 00151 static l_int32 generatePageStringPdf(L_PDF_DATA *lpd); 00152 static l_int32 generateContentStringPdf(L_PDF_DATA *lpd); 00153 static l_int32 generatePreXStringsPdf(L_PDF_DATA *lpd); 00154 static l_int32 generateColormapStringsPdf(L_PDF_DATA *lpd); 00155 static void generateTrailerPdf(L_PDF_DATA *lpd); 00156 static char *makeTrailerStringPdf(NUMA *naloc); 00157 static l_int32 generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes, 00158 L_PDF_DATA *lpd); 00159 00160 static l_int32 parseTrailerPdf(L_BYTEA *bas, NUMA **pna); 00161 static char *generatePagesObjStringPdf(NUMA *napage); 00162 static L_BYTEA *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs); 00163 00164 static L_PDF_DATA 
*pdfdataCreate(const char *title); 00165 static void pdfdataDestroy(L_PDF_DATA **plpd); 00166 static L_COMPRESSED_DATA *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index); 00167 00168 00169 /* ---------------- Defaults for rendering options ----------------- */ 00170 /* Output G4 as writing through image mask; this is the default */ 00171 static l_int32 var_WRITE_G4_IMAGE_MASK = 1; 00172 /* Write date/time and lib version into pdf; this is the default */ 00173 static l_int32 var_WRITE_DATE_AND_VERSION = 1; 00174 00175 #define L_SMALLBUF 256 00176 #define L_BIGBUF 2048 /* must be able to hold hex colormap */ 00177 00178 00179 #ifndef NO_CONSOLE_IO 00180 #define DEBUG_MULTIPAGE 0 00181 #endif /* ~NO_CONSOLE_IO */ 00182 00183 00184 /*---------------------------------------------------------------------* 00185 * Convert specified image files to Pdf (one image file per page) * 00186 *---------------------------------------------------------------------*/ 00187 /*! 00188 * convertFilesToPdf() 00189 * 00190 * Input: directory name (containing images) 00191 * substr (<optional> substring filter on filenames; can be NULL) 00192 * res (input resolution of all images) 00193 * scalefactor (scaling factor applied to each image) 00194 * quality (used for JPEG only; 0 for default (75)) 00195 * title (<optional> pdf title; if null, taken from the first 00196 * image filename) 00197 * fileout (pdf file of all images) 00198 * Return: 0 if OK, 1 on error 00199 * 00200 * Notes: 00201 * (1) If @substr is not NULL, only image filenames that contain 00202 * the substring can be used. If @substr == NULL, all files 00203 * in the directory are used. 00204 * (2) The files in the directory, after optional filtering by 00205 * the substring, are lexically sorted in increasing order 00206 * before concatenation. 00207 * (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without 00208 * colormap and many colors, or 32 bpp; FLATE for anything else. 
00209 */ 00210 l_int32 00211 convertFilesToPdf(const char *dirname, 00212 const char *substr, 00213 l_int32 res, 00214 l_float32 scalefactor, 00215 l_int32 quality, 00216 const char *title, 00217 const char *fileout) 00218 { 00219 l_int32 ret; 00220 SARRAY *sa; 00221 00222 PROCNAME("convertFilesToPdf"); 00223 00224 if (!dirname) 00225 return ERROR_INT("dirname not defined", procName, 1); 00226 if (!fileout) 00227 return ERROR_INT("fileout not defined", procName, 1); 00228 00229 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL) 00230 return ERROR_INT("sa not made", procName, 1); 00231 ret = saConvertFilesToPdf(sa, res, scalefactor, quality, title, fileout); 00232 sarrayDestroy(&sa); 00233 return ret; 00234 } 00235 00236 00237 /*! 00238 * saConvertFilesToPdf() 00239 * 00240 * Input: sarray (of pathnames for images) 00241 * res (input resolution of all images) 00242 * scalefactor (scaling factor applied to each image) 00243 * quality (used for JPEG only; 0 for default (75)) 00244 * title (<optional> pdf title; if null, taken from the first 00245 * image filename) 00246 * fileout (pdf file of all images) 00247 * Return: 0 if OK, 1 on error 00248 * 00249 * Notes: 00250 * (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without 00251 * colormap and many colors, or 32 bpp; FLATE for anything else. 
00252 */ 00253 l_int32 00254 saConvertFilesToPdf(SARRAY *sa, 00255 l_int32 res, 00256 l_float32 scalefactor, 00257 l_int32 quality, 00258 const char *title, 00259 const char *fileout) 00260 { 00261 l_uint8 *data; 00262 l_int32 ret; 00263 size_t nbytes; 00264 00265 PROCNAME("saConvertFilesToPdf"); 00266 00267 if (!sa) 00268 return ERROR_INT("sa not defined", procName, 1); 00269 00270 ret = saConvertFilesToPdfData(sa, res, scalefactor, quality, title, 00271 &data, &nbytes); 00272 if (ret) { 00273 if (data) FREE(data); 00274 return ERROR_INT("pdf data not made", procName, 1); 00275 } 00276 00277 ret = l_binaryWrite(fileout, "w", data, nbytes); 00278 FREE(data); 00279 if (ret) 00280 L_ERROR("pdf data not written to file", procName); 00281 return ret; 00282 } 00283 00284 00285 /*! 00286 * saConvertFilesToPdfData() 00287 * 00288 * Input: sarray (of pathnames for images) 00289 * res (input resolution of all images) 00290 * scalefactor (scaling factor applied to each image) 00291 * quality (used for JPEG only; 0 for default (75)) 00292 * title (<optional> pdf title; if null, taken from the first 00293 * image filename) 00294 * &data (<return> output pdf data (of all images) 00295 * &nbytes (<return> size of output pdf data) 00296 * Return: 0 if OK, 1 on error 00297 * 00298 * Notes: 00299 * (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without 00300 * colormap and many colors, or 32 bpp; FLATE for anything else. 
00301 */ 00302 l_int32 00303 saConvertFilesToPdfData(SARRAY *sa, 00304 l_int32 res, 00305 l_float32 scalefactor, 00306 l_int32 quality, 00307 const char *title, 00308 l_uint8 **pdata, 00309 size_t *pnbytes) 00310 { 00311 char *fname; 00312 l_uint8 *imdata; 00313 l_int32 i, n, ret, type, npages, scaledres; 00314 size_t imbytes; 00315 L_BYTEA *ba; 00316 PIX *pixs, *pix; 00317 L_PTRA *pa_data; 00318 00319 PROCNAME("saConvertFilesToPdfData"); 00320 00321 if (!sa) 00322 return ERROR_INT("sa not defined", procName, 1); 00323 if (scalefactor <= 0.0) scalefactor = 1.0; 00324 00325 /* Generate all the encoded pdf strings */ 00326 n = sarrayGetCount(sa); 00327 pa_data = ptraCreate(n); 00328 for (i = 0; i < n; i++) { 00329 fname = sarrayGetString(sa, i, L_NOCOPY); 00330 if ((pixs = pixRead(fname)) == NULL) { 00331 L_ERROR_STRING("image not readable from file %s", procName, fname); 00332 continue; 00333 } 00334 if (scalefactor != 1.0) 00335 pix = pixScale(pixs, scalefactor, scalefactor); 00336 else 00337 pix = pixClone(pixs); 00338 scaledres = (l_int32)(res * scalefactor); 00339 if (selectDefaultPdfEncoding(pix, &type)) { 00340 L_ERROR_STRING("encoding type selection failed for file %s", 00341 procName, fname); 00342 pixDestroy(&pix); 00343 continue; 00344 } 00345 ret = pixConvertToPdfData(pix, type, quality, &imdata, &imbytes, 00346 0, 0, scaledres, NULL, 0, title); 00347 pixDestroy(&pix); 00348 pixDestroy(&pixs); 00349 if (ret) { 00350 L_ERROR_STRING("pdf encoding failed for %s", procName, fname); 00351 continue; 00352 } 00353 ba = l_byteaInitFromMem(imdata, imbytes); 00354 if (imdata) FREE(imdata); 00355 ptraAdd(pa_data, ba); 00356 } 00357 ptraGetActualCount(pa_data, &npages); 00358 if (npages == 0) { 00359 L_ERROR("no pdf files made", procName); 00360 ptraDestroy(&pa_data, FALSE, FALSE); 00361 return 1; 00362 } 00363 00364 /* Concatenate them */ 00365 ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes); 00366 00367 ptraGetActualCount(pa_data, &npages); /* 
recalculate in case it changes */ 00368 for (i = 0; i < npages; i++) { 00369 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION); 00370 l_byteaDestroy(&ba); 00371 } 00372 ptraDestroy(&pa_data, FALSE, FALSE); 00373 return ret; 00374 } 00375 00376 00377 /*! 00378 * selectDefaultPdfEncoding() 00379 * 00380 * Input: pix 00381 * &type (<return> L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) 00382 * 00383 * Notes: 00384 * (1) This attempts to choose an encoding for the pix that results 00385 * in the smallest file, assuming that if jpeg encoded, it will 00386 * use quality = 75. The decision is approximate, in that 00387 * (a) all colormapped images will be losslessly encoded with 00388 * gzip (flate), and (b) an image with less than about 20 colors 00389 * is likely to be smaller if flate encoded than if encoded 00390 * as a jpeg (dct). For example, an image made by pixScaleToGray3() 00391 * will have 10 colors, and flate encoding will give about 00392 * twice the compression as jpeg with quality = 75. 
00393 */ 00394 l_int32 00395 selectDefaultPdfEncoding(PIX *pix, 00396 l_int32 *ptype) 00397 { 00398 l_int32 w, h, d, factor, ncolors; 00399 PIXCMAP *cmap; 00400 00401 PROCNAME("selectDefaultPdfEncoding"); 00402 00403 if (!pix) 00404 return ERROR_INT("pix not defined", procName, 1); 00405 if (!ptype) 00406 return ERROR_INT("&type not defined", procName, 1); 00407 *ptype = L_FLATE_ENCODE; /* default universal encoding */ 00408 pixGetDimensions(pix, &w, &h, &d); 00409 cmap = pixGetColormap(pix); 00410 if (d == 8 && !cmap) { 00411 factor = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 20000.)); 00412 pixNumColors(pix, factor, &ncolors); 00413 if (ncolors < 20) 00414 *ptype = L_FLATE_ENCODE; 00415 else 00416 *ptype = L_JPEG_ENCODE; 00417 } 00418 else if (d == 1) 00419 *ptype = L_G4_ENCODE; 00420 else if (cmap || d == 2 || d == 4) 00421 *ptype = L_FLATE_ENCODE; 00422 else if (d == 8 || d == 32) 00423 *ptype = L_JPEG_ENCODE; 00424 else 00425 return ERROR_INT("type selection failure", procName, 1); 00426 00427 return 0; 00428 } 00429 00430 00431 /*---------------------------------------------------------------------* 00432 * Single page, multi-image converters * 00433 *---------------------------------------------------------------------*/ 00434 /*! 
*  convertToPdf()
 *
 *      Input:  filein (input image file -- any format)
 *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
 *              quality (used for JPEG only; 0 for default (75))
 *              fileout (output pdf file; only required on last image on page)
 *              x, y (location of lower-left corner of image, in pixels,
 *                    relative to the PostScript origin (0,0) at
 *                    the lower-left corner of the page)
 *              res (override the resolution of the input image, in ppi;
 *                   use 0 to respect the resolution embedded in the input)
 *              &lpd (ptr to lpd, which is created on the first invocation
 *                    and returned until last image is processed, at which
 *                    time it is destroyed)
 *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
 *                       L_LAST_IMAGE)
 *              title (<optional> pdf title; if null, taken from the first
 *                     image placed on a page; e.g., an input image filename)
 *      Return: 0 if OK, 1 on error
 *
 *  Notes:
 *      (1) To wrap only one image in pdf, input @plpd = NULL, and
 *          the value of @position will be ignored:
 *            convertToPdf(... type, quality, fileout, x, y, res,
 *                         NULL, 0, title);
 *      (2) To wrap multiple images on a single pdf page, this is called
 *          once for each successive image.  Do it this way:
 *            L_PDF_DATA   *lpd;
 *            convertToPdf(... type, quality, fileout, x, y, res,
 *                         &lpd, L_FIRST_IMAGE, title);
 *            convertToPdf(... type, quality, fileout, x, y, res,
 *                         &lpd, L_NEXT_IMAGE, title);
 *            ...
 *            convertToPdf(... type, quality, fileout, x, y, res,
 *                         &lpd, L_LAST_IMAGE, title);
 *          This will write the result to the value of @fileout specified
 *          in the last call; the values of @fileout given in the earlier
 *          calls are ignored and may be NULL.
 *          On the last call: the pdf data bytes are computed and written
 *          to @fileout, lpd is destroyed internally, and the returned
 *          value of lpd is null.  So the client has nothing to clean up.
 *      (3) (a) Set @res == 0 to respect the resolution embedded in the
 *              image file.
If no resolution is embedded, it will be set 00473 * to the default value. 00474 * (b) Set @res to some other value to override the file resolution. 00475 * (4) (a) If the input @res and the resolution of the output device 00476 * are equal, the image will be "displayed" at the same size 00477 * as the original. 00478 * (b) If the input @res is 72, the output device will render 00479 * the image at 1 pt/pixel. 00480 * (c) Some possible choices for the default input pix resolution are: 00481 * 72 ppi Render pix on any output device at one pt/pixel 00482 * 96 ppi Windows default for generated display images 00483 * 300 ppi Typical default for scanned images. 00484 * We choose 300, which is sensible for rendering page images. 00485 * However, images come from a variety of sources, and 00486 * some are explicitly created for viewing on a display. 00487 */ 00488 l_int32 00489 convertToPdf(const char *filein, 00490 l_int32 type, 00491 l_int32 quality, 00492 const char *fileout, 00493 l_int32 x, 00494 l_int32 y, 00495 l_int32 res, 00496 L_PDF_DATA **plpd, 00497 l_int32 position, 00498 const char *title) 00499 { 00500 l_uint8 *data; 00501 l_int32 ret; 00502 size_t nbytes; 00503 00504 PROCNAME("convertToPdf"); 00505 00506 if (!filein) 00507 return ERROR_INT("filein not defined", procName, 1); 00508 if (!plpd || (position == L_LAST_IMAGE)) { 00509 if (!fileout) 00510 return ERROR_INT("fileout not defined", procName, 1); 00511 } 00512 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 00513 type != L_FLATE_ENCODE) 00514 return ERROR_INT("invalid conversion type", procName, 1); 00515 00516 if (convertToPdfData(filein, type, quality, &data, &nbytes, x, y, 00517 res, plpd, position, title)) 00518 return ERROR_INT("pdf data not made", procName, 1); 00519 00520 if (!plpd || (position == L_LAST_IMAGE)) { 00521 ret = l_binaryWrite(fileout, "w", data, nbytes); 00522 FREE(data); 00523 if (ret) 00524 return ERROR_INT("pdf data not written to file", procName, 1); 00525 } 00526 00527 
return 0; 00528 } 00529 00530 00531 /*! 00532 * convertImageDataToPdf() 00533 * 00534 * Input: imdata (array of formatted image data; e.g., png, jpeg) 00535 * size (size of image data) 00536 * type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) 00537 * quality (used for JPEG only; 0 for default (75)) 00538 * fileout (output pdf file; only required on last image on page) 00539 * x, y (location of lower-left corner of image, in pixels, 00540 * relative to the PostScript origin (0,0) at 00541 * the lower-left corner of the page) 00542 * res (override the resolution of the input image, in ppi; 00543 * use 0 to respect the resolution embedded in the input) 00544 * &lpd (ptr to lpd, which is created on the first invocation 00545 * and returned until last image is processed, at which 00546 * time it is destroyed) 00547 * position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, 00548 * L_LAST_IMAGE) 00549 * title (<optional> pdf title; taken from the first image 00550 * placed on a page; e.g., an input image filename) 00551 * Return: 0 if OK, 1 on error 00552 * 00553 * Notes: 00554 * (1) If @res == 0 and the input resolution field is 0, 00555 * this will use DEFAULT_INPUT_RES. 00556 * (2) See comments in convertToPdf(). 
00557 */ 00558 l_int32 00559 convertImageDataToPdf(l_uint8 *imdata, 00560 size_t size, 00561 l_int32 type, 00562 l_int32 quality, 00563 const char *fileout, 00564 l_int32 x, 00565 l_int32 y, 00566 l_int32 res, 00567 L_PDF_DATA **plpd, 00568 l_int32 position, 00569 const char *title) 00570 { 00571 l_int32 ret; 00572 PIX *pix; 00573 00574 PROCNAME("convertImageDataToPdf"); 00575 00576 if (!imdata) 00577 return ERROR_INT("image data not defined", procName, 1); 00578 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 00579 type != L_FLATE_ENCODE) 00580 return ERROR_INT("invalid conversion type", procName, 1); 00581 if (!plpd || (position == L_LAST_IMAGE)) { 00582 if (!fileout) 00583 return ERROR_INT("fileout not defined", procName, 1); 00584 } 00585 00586 if ((pix = pixReadMem(imdata, size)) == NULL) 00587 return ERROR_INT("pix not read", procName, 1); 00588 ret = pixConvertToPdf(pix, type, quality, fileout, x, y, res, 00589 plpd, position, title); 00590 pixDestroy(&pix); 00591 return ret; 00592 } 00593 00594 00595 /*! 
00596 * convertToPdfData() 00597 * 00598 * Input: filein (input image file -- any format) 00599 * type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) 00600 * quality (used for JPEG only; 0 for default (75)) 00601 * &data (<return> pdf data in memory) 00602 * &nbytes (<return> number of bytes in pdf data) 00603 * x, y (location of lower-left corner of image, in pixels, 00604 * relative to the PostScript origin (0,0) at 00605 * the lower-left corner of the page) 00606 * res (override the resolution of the input image, in ppi; 00607 * use 0 to respect the resolution embedded in the input) 00608 * &lpd (ptr to lpd, which is created on the first invocation 00609 * and returned until last image is processed, at which 00610 * time it is destroyed) 00611 * position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, 00612 * L_LAST_IMAGE) 00613 * title (<optional> pdf title; taken from the first image 00614 * placed on a page; e.g., an input image filename) 00615 * Return: 0 if OK, 1 on error 00616 * 00617 * Notes: 00618 * (1) If @res == 0 and the input resolution field is 0, 00619 * this will use DEFAULT_INPUT_RES. 00620 * (2) See comments in convertToPdf(). 
00621 */ 00622 l_int32 00623 convertToPdfData(const char *filein, 00624 l_int32 type, 00625 l_int32 quality, 00626 l_uint8 **pdata, 00627 size_t *pnbytes, 00628 l_int32 x, 00629 l_int32 y, 00630 l_int32 res, 00631 L_PDF_DATA **plpd, 00632 l_int32 position, 00633 const char *title) 00634 { 00635 PIX *pix; 00636 00637 PROCNAME("convertToPdfData"); 00638 00639 if (!pdata) 00640 return ERROR_INT("&data not defined", procName, 1); 00641 *pdata = NULL; 00642 if (!pnbytes) 00643 return ERROR_INT("&nbytes not defined", procName, 1); 00644 *pnbytes = 0; 00645 if (!filein) 00646 return ERROR_INT("filein not defined", procName, 1); 00647 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 00648 type != L_FLATE_ENCODE) 00649 return ERROR_INT("invalid conversion type", procName, 1); 00650 00651 if ((pix = pixRead(filein)) == NULL) 00652 return ERROR_INT("pix not made", procName, 1); 00653 00654 pixConvertToPdfData(pix, type, quality, pdata, pnbytes, 00655 x, y, res, plpd, position, title); 00656 pixDestroy(&pix); 00657 return 0; 00658 } 00659 00660 00661 /*! 
00662 * convertImageDataToPdfData() 00663 * 00664 * Input: imdata (array of formatted image data; e.g., png, jpeg) 00665 * size (size of image data) 00666 * type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) 00667 * quality (used for JPEG only; 0 for default (75)) 00668 * &data (<return> pdf data in memory) 00669 * &nbytes (<return> number of bytes in pdf data) 00670 * x, y (location of lower-left corner of image, in pixels, 00671 * relative to the PostScript origin (0,0) at 00672 * the lower-left corner of the page) 00673 * res (override the resolution of the input image, in ppi; 00674 * use 0 to respect the resolution embedded in the input) 00675 * &lpd (ptr to lpd, which is created on the first invocation 00676 * and returned until last image is processed, at which 00677 * time it is destroyed) 00678 * position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, 00679 * L_LAST_IMAGE) 00680 * title (<optional> pdf title; taken from the first image 00681 * placed on a page; e.g., an input image filename) 00682 * Return: 0 if OK, 1 on error 00683 * 00684 * Notes: 00685 * (1) If @res == 0 and the input resolution field is 0, 00686 * this will use DEFAULT_INPUT_RES. 00687 * (2) See comments in convertToPdf(). 
00688 */ 00689 l_int32 00690 convertImageDataToPdfData(l_uint8 *imdata, 00691 size_t size, 00692 l_int32 type, 00693 l_int32 quality, 00694 l_uint8 **pdata, 00695 size_t *pnbytes, 00696 l_int32 x, 00697 l_int32 y, 00698 l_int32 res, 00699 L_PDF_DATA **plpd, 00700 l_int32 position, 00701 const char *title) 00702 { 00703 l_int32 ret; 00704 PIX *pix; 00705 00706 PROCNAME("convertImageDataToPdfData"); 00707 00708 if (!imdata) 00709 return ERROR_INT("image data not defined", procName, 1); 00710 if (!pdata) 00711 return ERROR_INT("&data not defined", procName, 1); 00712 *pdata = NULL; 00713 if (!pnbytes) 00714 return ERROR_INT("&nbytes not defined", procName, 1); 00715 *pnbytes = 0; 00716 if (plpd) { /* part of multi-page invocation */ 00717 if (position == L_FIRST_IMAGE) 00718 *plpd = NULL; 00719 } 00720 00721 if ((pix = pixReadMem(imdata, size)) == NULL) 00722 return ERROR_INT("pix not read", procName, 1); 00723 ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes, 00724 x, y, res, plpd, position, title); 00725 pixDestroy(&pix); 00726 return ret; 00727 } 00728 00729 00730 /*! 
00731 * pixConvertToPdf() 00732 * 00733 * Input: pix 00734 * type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) 00735 * quality (used for JPEG only; 0 for default (75)) 00736 * fileout (output pdf file; only required on last image on page) 00737 * x, y (location of lower-left corner of image, in pixels, 00738 * relative to the PostScript origin (0,0) at 00739 * the lower-left corner of the page) 00740 * res (override the resolution of the input image, in ppi; 00741 * use 0 to respect the resolution embedded in the input) 00742 * &lpd (ptr to lpd, which is created on the first invocation 00743 * and returned until last image is processed) 00744 * position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, 00745 * L_LAST_IMAGE) 00746 * title (<optional> pdf title; taken from the first image 00747 * placed on a page; e.g., an input image filename) 00748 * Return: 0 if OK, 1 on error 00749 * 00750 * Notes: 00751 * (1) If @res == 0 and the input resolution field is 0, 00752 * this will use DEFAULT_INPUT_RES. 00753 * (2) This only writes data to fileout if it is the last 00754 * image to be written on the page. 00755 * (3) See comments in convertToPdf(). 
00756 */ 00757 l_int32 00758 pixConvertToPdf(PIX *pix, 00759 l_int32 type, 00760 l_int32 quality, 00761 const char *fileout, 00762 l_int32 x, 00763 l_int32 y, 00764 l_int32 res, 00765 L_PDF_DATA **plpd, 00766 l_int32 position, 00767 const char *title) 00768 { 00769 l_uint8 *data; 00770 l_int32 ret; 00771 size_t nbytes; 00772 00773 PROCNAME("pixConvertToPdf"); 00774 00775 if (!pix) 00776 return ERROR_INT("pix not defined", procName, 1); 00777 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 00778 type != L_FLATE_ENCODE) 00779 return ERROR_INT("invalid conversion type", procName, 1); 00780 if (!plpd || (position == L_LAST_IMAGE)) { 00781 if (!fileout) 00782 return ERROR_INT("fileout not defined", procName, 1); 00783 } 00784 00785 if (pixConvertToPdfData(pix, type, quality, &data, &nbytes, 00786 x, y, res, plpd, position, title)) 00787 return ERROR_INT("pdf data not made", procName, 1); 00788 00789 if (!plpd || (position == L_LAST_IMAGE)) { 00790 ret = l_binaryWrite(fileout, "w", data, nbytes); 00791 FREE(data); 00792 if (ret) 00793 return ERROR_INT("pdf data not written to file", procName, 1); 00794 } 00795 return 0; 00796 } 00797 00798 00799 /*! 
00800 * pixConvertToPdfData() 00801 * 00802 * Input: pix (all depths; cmap OK) 00803 * type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) 00804 * quality (used for JPEG only; 0 for default (75)) 00805 * &data (<return> pdf array) 00806 * &nbytes (<return> number of bytes in pdf array) 00807 * x, y (location of lower-left corner of image, in pixels, 00808 * relative to the PostScript origin (0,0) at 00809 * the lower-left corner of the page) 00810 * res (override the resolution of the input image, in ppi; 00811 * use 0 to respect the resolution embedded in the input) 00812 * &lpd (ptr to lpd, which is created on the first invocation 00813 * and returned until last image is processed) 00814 * position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, 00815 * L_LAST_IMAGE) 00816 * title (<optional> pdf title; taken from the first image 00817 * placed on a page; e.g., an input image filename) 00818 * Return: 0 if OK, 1 on error 00819 * 00820 * Notes: 00821 * (1) If @res == 0 and the input resolution field is 0, 00822 * this will use DEFAULT_INPUT_RES. 00823 * (2) This only writes @data if it is the last image to be 00824 * written on the page. 00825 * (3) See comments in convertToPdf(). 
00826 */ 00827 l_int32 00828 pixConvertToPdfData(PIX *pix, 00829 l_int32 type, 00830 l_int32 quality, 00831 l_uint8 **pdata, 00832 size_t *pnbytes, 00833 l_int32 x, 00834 l_int32 y, 00835 l_int32 res, 00836 L_PDF_DATA **plpd, 00837 l_int32 position, 00838 const char *title) 00839 { 00840 l_int32 pixres, w, h, d, ret; 00841 l_float32 xpt, ypt, wpt, hpt; 00842 L_COMPRESSED_DATA *cid = NULL; 00843 L_PDF_DATA *lpd = NULL; 00844 PIXCMAP *cmap; 00845 00846 PROCNAME("pixConvertToPdfData"); 00847 00848 if (!pdata) 00849 return ERROR_INT("&data not defined", procName, 1); 00850 *pdata = NULL; 00851 if (!pnbytes) 00852 return ERROR_INT("&nbytes not defined", procName, 1); 00853 *pnbytes = 0; 00854 if (!pix) 00855 return ERROR_INT("pix not defined", procName, 1); 00856 if (plpd) { /* part of multi-page invocation */ 00857 if (position == L_FIRST_IMAGE) 00858 *plpd = NULL; 00859 } 00860 00861 /* Sanity check on requested encoding */ 00862 d = pixGetDepth(pix); 00863 cmap = pixGetColormap(pix); 00864 if (cmap && type != L_FLATE_ENCODE) { 00865 L_WARNING("pix has cmap; using flate encoding", procName); 00866 type = L_FLATE_ENCODE; 00867 } 00868 else if (d < 8 && type == L_JPEG_ENCODE) { 00869 L_WARNING("pix has < 8 bpp; using flate encoding", procName); 00870 type = L_FLATE_ENCODE; 00871 } 00872 else if (d > 1 && type == L_G4_ENCODE) { 00873 L_WARNING("pix has > 1 bpp; using flate encoding", procName); 00874 type = L_FLATE_ENCODE; 00875 } 00876 00877 if (type == L_JPEG_ENCODE) { 00878 if ((cid = pixGenerateJpegData(pix, 0, quality)) == NULL) 00879 return ERROR_INT("jpeg data not made", procName, 1); 00880 pixres = cid->res; 00881 w = cid->w; 00882 h = cid->h; 00883 } 00884 else if (type == L_G4_ENCODE) { 00885 if ((cid = pixGenerateG4Data(pix, 0)) == NULL) 00886 return ERROR_INT("g4 data not made", procName, 1); 00887 pixres = cid->res; 00888 w = cid->w; 00889 h = cid->h; 00890 } 00891 else if (type == L_FLATE_ENCODE) { 00892 if ((cid = pixGenerateFlateData(pix, 0)) == NULL) 
00893 return ERROR_INT("flate data not made", procName, 1); 00894 pixres = cid->res; 00895 w = cid->w; 00896 h = cid->h; 00897 } 00898 else 00899 return ERROR_INT("invalid conversion type", procName, 1); 00900 00901 /* Get media box in pts. Guess the input image resolution 00902 * based on the input parameter @res, the resolution data in 00903 * the pix, and the size of the image. */ 00904 if (res <= 0.0) { 00905 if (pixres > 0) 00906 res = pixres; 00907 else 00908 res = DEFAULT_INPUT_RES; 00909 } 00910 xpt = x * 72. / res; 00911 ypt = y * 72. / res; 00912 wpt = w * 72. / res; 00913 hpt = h * 72. / res; 00914 00915 /* Set up lpd */ 00916 if (!plpd) { /* single image */ 00917 if ((lpd = pdfdataCreate(title)) == NULL) 00918 return ERROR_INT("lpd not made", procName, 1); 00919 } 00920 else if (position == L_FIRST_IMAGE) { /* first of multiple images */ 00921 if ((lpd = pdfdataCreate(title)) == NULL) 00922 return ERROR_INT("lpd not made", procName, 1); 00923 *plpd = lpd; 00924 } 00925 else /* not the first of multiple images */ 00926 lpd = *plpd; 00927 00928 /* Add the data to the lpd */ 00929 ptraAdd(lpd->cida, cid); 00930 lpd->n++; 00931 ptaAddPt(lpd->xy, xpt, ypt); 00932 ptaAddPt(lpd->wh, wpt, hpt); 00933 00934 /* If a single image or the last of multiple images, 00935 * generate the pdf and destroy the lpd */ 00936 if (!plpd || (position == L_LAST_IMAGE)) { 00937 ret = l_generatePdf(pdata, pnbytes, lpd); 00938 pdfdataDestroy(&lpd); 00939 if (plpd) *plpd = NULL; 00940 if (ret) 00941 return ERROR_INT("pdf output not made", procName, 1); 00942 } 00943 00944 return 0; 00945 } 00946 00947 00948 /*! 
00949 * pixWriteStreamPdf() 00950 * 00951 * Input: fp (stream opened for writing) 00952 * pix (all depths, cmap OK) 00953 * res (override the resolution of the input image, in ppi; 00954 * use 0 to respect the resolution embedded in the input) 00955 * title (<optional> pdf title; taken from the first image 00956 * placed on a page; e.g., an input image filename) 00957 * Return: 0 if OK, 1 on error 00958 * 00959 * Notes: 00960 * (1) This is the simplest interface for writing a single image 00961 * with pdf encoding. It uses G4 encoding for 1 bpp, 00962 * JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE 00963 * encoding for everything else. 00964 */ 00965 l_int32 00966 pixWriteStreamPdf(FILE *fp, 00967 PIX *pix, 00968 l_int32 res, 00969 const char *title) 00970 { 00971 l_uint8 *data; 00972 l_int32 ret, d, type; 00973 size_t nbytes; 00974 PIXCMAP *cmap; 00975 00976 PROCNAME("pixWriteStreamPdf"); 00977 00978 if (!fp) 00979 return ERROR_INT("stream not opened", procName, 1); 00980 if (!pix) 00981 return ERROR_INT("pix not defined", procName, 1); 00982 00983 d = pixGetDepth(pix); 00984 cmap = pixGetColormap(pix); 00985 if (d == 1) 00986 type = L_G4_ENCODE; 00987 else if (cmap || d == 2 || d == 4 || d == 16) 00988 type = L_FLATE_ENCODE; 00989 else /* d == 8 (no cmap) or d == 32 */ 00990 type = L_JPEG_ENCODE; 00991 if (pixConvertToPdfData(pix, type, 75, &data, &nbytes, 00992 0, 0, res, NULL, 0, title)) 00993 return ERROR_INT("pdf data not made", procName, 1); 00994 ret = fwrite(data, 1, nbytes, fp); 00995 00996 FREE(data); 00997 if (ret) 00998 return ERROR_INT("pdf data not written to stream", procName, 1); 00999 return 0; 01000 } 01001 01002 01003 /*---------------------------------------------------------------------* 01004 * Segmented multi-page, multi-image converter * 01005 *---------------------------------------------------------------------*/ 01006 /*! 
01007 * convertSegmentedFilesToPdf() 01008 * 01009 * Input: directory name (containing images) 01010 * substr (<optional> substring filter on filenames; can be NULL) 01011 * res (input resolution of all images) 01012 * type (compression type for non-image regions; the 01013 * image regions are always compressed with L_JPEG_ENCODE) 01014 * thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) 01015 * boxaa (of image regions) 01016 * quality (used for JPEG only; 0 for default (75)) 01017 * scalefactor (scaling factor applied to each image region) 01018 * title (<optional> pdf title; if null, taken from the first 01019 * image filename) 01020 * fileout (pdf file of all images) 01021 * Return: 0 if OK, 1 on error 01022 * 01023 * Notes: 01024 * (1) If @substr is not NULL, only image filenames that contain 01025 * the substring can be used. If @substr == NULL, all files 01026 * in the directory are used. 01027 * (2) The files in the directory, after optional filtering by 01028 * the substring, are lexically sorted in increasing order 01029 * before concatenation. 01030 * (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without 01031 * colormap and many colors, or 32 bpp; FLATE for anything else. 01032 * (4) The boxaa contains one boxa of "image regions" for each 01033 * image file. The boxa must all exist, but they can be empty. 01034 * They must be aligned with the sorted set of images. 01035 * (5) The scalefactor is applied to each image region. It is 01036 * typically < 1.0, to save bytes in the final pdf, because 01037 * the resolution is often not critical in non-text regions. 01038 * (6) The non-image regions are automatically scaled up by 2x and 01039 * thresholded if the encoding type is G4. If the non-image 01040 * regions are not encoded with G4, no scaling is performed on them. 
01041 */ 01042 l_int32 01043 convertSegmentedFilesToPdf(const char *dirname, 01044 const char *substr, 01045 l_int32 res, 01046 l_int32 type, 01047 l_int32 thresh, 01048 BOXAA *baa, 01049 l_int32 quality, 01050 l_float32 scalefactor, 01051 const char *title, 01052 const char *fileout) 01053 { 01054 char *fname; 01055 l_uint8 *imdata, *data; 01056 l_int32 i, npages, nboxa, nboxes, ret; 01057 size_t imbytes, databytes; 01058 BOXA *boxa; 01059 L_BYTEA *ba; 01060 L_PTRA *pa_data; 01061 SARRAY *sa; 01062 01063 PROCNAME("convertSegmentedFilesToPdf"); 01064 01065 if (!dirname) 01066 return ERROR_INT("dirname not defined", procName, 1); 01067 if (!baa) 01068 return ERROR_INT("baa not defined", procName, 1); 01069 if (!fileout) 01070 return ERROR_INT("fileout not defined", procName, 1); 01071 01072 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL) 01073 return ERROR_INT("sa not made", procName, 1); 01074 01075 /* Generate and save all the encoded pdf strings */ 01076 npages = sarrayGetCount(sa); 01077 nboxa = boxaaGetCount(baa); 01078 if (npages != nboxa) { 01079 sarrayDestroy(&sa); 01080 return ERROR_INT("npages != nboxa", procName, 1); 01081 } 01082 pa_data = ptraCreate(npages); 01083 for (i = 0; i < npages; i++) { 01084 fname = sarrayGetString(sa, i, L_NOCOPY); 01085 boxa = boxaaGetBoxa(baa, i, L_CLONE); 01086 nboxes = boxaGetCount(boxa); 01087 if (nboxes == 0) 01088 boxaDestroy(&boxa); 01089 ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa, 01090 quality, scalefactor, 01091 &imdata, &imbytes); 01092 boxaDestroy(&boxa); /* safe; in case nboxes > 0 */ 01093 if (ret) { 01094 L_ERROR_STRING("pdf encoding failed for %s", procName, fname); 01095 continue; 01096 } 01097 ba = l_byteaInitFromMem(imdata, imbytes); 01098 if (imdata) FREE(imdata); 01099 ptraAdd(pa_data, ba); 01100 } 01101 sarrayDestroy(&sa); 01102 01103 ptraGetActualCount(pa_data, &npages); 01104 if (npages == 0) { 01105 L_ERROR("no pdf files made", procName); 01106 
ptraDestroy(&pa_data, FALSE, FALSE); 01107 return 1; 01108 } 01109 01110 /* Concatenate */ 01111 ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes); 01112 01113 /* Clean up */ 01114 ptraGetActualCount(pa_data, &npages); /* recalculate in case it changes */ 01115 for (i = 0; i < npages; i++) { 01116 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION); 01117 l_byteaDestroy(&ba); 01118 } 01119 ptraDestroy(&pa_data, FALSE, FALSE); 01120 01121 if (ret) { 01122 if (data) FREE(data); 01123 return ERROR_INT("pdf data not made", procName, 1); 01124 } 01125 01126 ret = l_binaryWrite(fileout, "w", data, databytes); 01127 FREE(data); 01128 if (ret) 01129 L_ERROR("pdf data not written to file", procName); 01130 return ret; 01131 } 01132 01133 01134 /*---------------------------------------------------------------------* 01135 * Segmented single page, multi-image converters * 01136 *---------------------------------------------------------------------*/ 01137 /*! 01138 * convertToPdfSegmented() 01139 * 01140 * Input: filein (input image file -- any format) 01141 * res (input image resolution; typ. 300 ppi; use 0 for default) 01142 * type (compression type for non-image regions; the 01143 * image regions are always compressed with L_JPEG_ENCODE) 01144 * thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) 01145 * boxa (of image regions; can be null) 01146 * quality (used for jpeg image regions; 0 for default) 01147 * scalefactor (used for jpeg regions; must be <= 1.0) 01148 * fileout (output pdf file) 01149 * Return: 0 if OK, 1 on error 01150 * 01151 * Notes: 01152 * (1) If there are no image regions, set @boxa == NULL; 01153 * @quality and @scalefactor are ignored. 01154 * (2) Typically, @scalefactor is < 1.0, because the image regions 01155 * can be rendered at a lower resolution (for better compression) 01156 * than the text regions. If @scalefactor == 0, we use 1.0. 
01157 * If the input image is 1 bpp and scalefactor < 1.0, we 01158 * use scaleToGray() to downsample the image regions to gray 01159 * before compressing them. 01160 * (3) If the compression type for non-image regions is L_G4_ENCODE 01161 * and bpp > 1, the image is upscaled 2x and thresholded 01162 * to 1 bpp. That is the only situation where @thresh is used. 01163 * (4) The parameter @quality is only used for image regions. 01164 * If @type == L_JPEG_ENCODE, default jpeg quality (75) is 01165 * used for the non-image regions. 01166 * (5) Processing matrix for non-image regions. 01167 * 01168 * Input G4 JPEG FLATE 01169 * ----------|--------------------------------------------------- 01170 * 1 bpp | 1x, 1 bpp 1x flate, 1 bpp 1x, 1 bpp 01171 * | 01172 * cmap | 2x, 1 bpp 1x flate, cmap 1x, cmap 01173 * | 01174 * 2,4 bpp | 2x, 1 bpp 1x flate 1x, 2,4 bpp 01175 * no cmap | 2,4 bpp 01176 * | 01177 * 8,32 bpp | 2x, 1 bpp 1x (jpeg) 1x, 8,32 bpp 01178 * no cmap | 8,32 bpp 01179 * 01180 * Summary: 01181 * (a) if G4 is requested, G4 is used, with 2x upscaling 01182 * for all cases except 1 bpp. 01183 * (b) if JPEG is requested, use flate encoding for all cases 01184 * except 8 bpp without cmap and 32 bpp (rgb). 01185 * (c) if FLATE is requested, use flate with no transformation 01186 * of the raster data. 
01187 * (6) Calling options/sequence for these functions: 01188 * file --> file (convertToPdfSegmented) 01189 * pix --> file (pixConvertToPdfSegmented) 01190 * pix --> data (pixConvertToPdfDataSegmented) 01191 * file --> data (convertToPdfDataSegmented) 01192 * pix --> data (pixConvertToPdfDataSegmented) 01193 */ 01194 l_int32 01195 convertToPdfSegmented(const char *filein, 01196 l_int32 res, 01197 l_int32 type, 01198 l_int32 thresh, 01199 BOXA *boxa, 01200 l_int32 quality, 01201 l_float32 scalefactor, 01202 const char *fileout) 01203 { 01204 l_int32 ret; 01205 PIX *pixs; 01206 01207 PROCNAME("convertToPdfSegmented"); 01208 01209 if (!filein) 01210 return ERROR_INT("filein not defined", procName, 1); 01211 if (!fileout) 01212 return ERROR_INT("fileout not defined", procName, 1); 01213 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 01214 type != L_FLATE_ENCODE) 01215 return ERROR_INT("invalid conversion type", procName, 1); 01216 if (boxa && scalefactor > 1.0) { 01217 L_WARNING("setting scalefactor to 1.0", procName); 01218 scalefactor = 1.0; 01219 } 01220 01221 if ((pixs = pixRead(filein)) == NULL) 01222 return ERROR_INT("pixs not made", procName, 1); 01223 01224 ret = pixConvertToPdfSegmented(pixs, res, type, thresh, boxa, quality, 01225 scalefactor, fileout, filein); 01226 pixDestroy(&pixs); 01227 return ret; 01228 } 01229 01230 01231 /*! 01232 * pixConvertToPdfSegmented() 01233 * 01234 * Input: pixs (any depth, cmap OK) 01235 * res (input image resolution; typ. 
300 ppi; use 0 for default) 01236 * type (compression type for non-image regions; the 01237 * image regions are always compressed with L_JPEG_ENCODE) 01238 * thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) 01239 * boxa (of image regions; can be null) 01240 * quality (used for jpeg image regions; 0 for default) 01241 * scalefactor (used for jpeg regions; must be <= 1.0) 01242 * fileout (output pdf file) 01243 * title (<optional> pdf title; typically taken from the 01244 * input file for the pix) 01245 * Return: 0 if OK, 1 on error 01246 * 01247 * Notes: 01248 * (1) See convertToPdfSegmented() for details. 01249 */ 01250 l_int32 01251 pixConvertToPdfSegmented(PIX *pixs, 01252 l_int32 res, 01253 l_int32 type, 01254 l_int32 thresh, 01255 BOXA *boxa, 01256 l_int32 quality, 01257 l_float32 scalefactor, 01258 const char *fileout, 01259 const char *title) 01260 { 01261 l_uint8 *data; 01262 l_int32 ret; 01263 size_t nbytes; 01264 01265 PROCNAME("pixConvertToPdfSegmented"); 01266 01267 if (!pixs) 01268 return ERROR_INT("pixs not defined", procName, 1); 01269 if (!fileout) 01270 return ERROR_INT("fileout not defined", procName, 1); 01271 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 01272 type != L_FLATE_ENCODE) 01273 return ERROR_INT("invalid conversion type", procName, 1); 01274 if (boxa && scalefactor > 1.0) { 01275 L_WARNING("setting scalefactor to 1.0", procName); 01276 scalefactor = 1.0; 01277 } 01278 01279 ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality, 01280 scalefactor, &data, &nbytes, title); 01281 if (ret) 01282 return ERROR_INT("pdf generation failure", procName, 1); 01283 01284 ret = l_binaryWrite(fileout, "w", data, nbytes); 01285 if (data) FREE(data); 01286 return ret; 01287 } 01288 01289 01290 /*! 01291 * convertToPdfDataSegmented() 01292 * 01293 * Input: filein (input image file -- any format) 01294 * res (input image resolution; typ. 
300 ppi; use 0 for default) 01295 * type (compression type for non-image regions; the 01296 * image regions are always compressed with L_JPEG_ENCODE) 01297 * thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) 01298 * boxa (of image regions; can be null) 01299 * quality (used for jpeg image regions; 0 for default) 01300 * scalefactor (used for jpeg regions; must be <= 1.0) 01301 * &data (<return> pdf data in memory) 01302 * &nbytes (<return> number of bytes in pdf data) 01303 * Return: 0 if OK, 1 on error 01304 * 01305 * Notes: 01306 * (1) If there are no image regions, set @boxa == NULL; 01307 * @quality and @scalefactor are ignored. 01308 * (2) Typically, @scalefactor is < 1.0. The image regions are 01309 */ 01310 l_int32 01311 convertToPdfDataSegmented(const char *filein, 01312 l_int32 res, 01313 l_int32 type, 01314 l_int32 thresh, 01315 BOXA *boxa, 01316 l_int32 quality, 01317 l_float32 scalefactor, 01318 l_uint8 **pdata, 01319 size_t *pnbytes) 01320 { 01321 l_int32 ret; 01322 PIX *pixs; 01323 01324 PROCNAME("convertToPdfDataSegmented"); 01325 01326 if (!pdata) 01327 return ERROR_INT("&data not defined", procName, 1); 01328 *pdata = NULL; 01329 if (!pnbytes) 01330 return ERROR_INT("&nbytes not defined", procName, 1); 01331 *pnbytes = 0; 01332 if (!filein) 01333 return ERROR_INT("filein not defined", procName, 1); 01334 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 01335 type != L_FLATE_ENCODE) 01336 return ERROR_INT("invalid conversion type", procName, 1); 01337 if (boxa && scalefactor > 1.0) { 01338 L_WARNING("setting scalefactor to 1.0", procName); 01339 scalefactor = 1.0; 01340 } 01341 01342 if ((pixs = pixRead(filein)) == NULL) 01343 return ERROR_INT("pixs not made", procName, 1); 01344 01345 ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, 01346 quality, scalefactor, 01347 pdata, pnbytes, filein); 01348 pixDestroy(&pixs); 01349 return ret; 01350 } 01351 01352 01353 /*! 
01354 * pixConvertToPdfDataSegmented() 01355 * 01356 * Input: pixs (any depth, cmap OK) 01357 * res (input image resolution; typ. 300 ppi; use 0 for default) 01358 * type (compression type for non-image regions; the 01359 * image regions are always compressed with L_JPEG_ENCODE) 01360 * thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) 01361 * boxa (of image regions; can be null) 01362 * quality (used for jpeg image regions; 0 for default) 01363 * scalefactor (used for jpeg regions; must be <= 1.0) 01364 * &data (<return> pdf data in memory) 01365 * &nbytes (<return> number of bytes in pdf data) 01366 * title (<optional> pdf title; typically taken from the 01367 * input file for the pix) 01368 * Return: 0 if OK, 1 on error 01369 * 01370 * Notes: 01371 * (1) See convertToPdfSegmented() for details. 01372 */ 01373 l_int32 01374 pixConvertToPdfDataSegmented(PIX *pixs, 01375 l_int32 res, 01376 l_int32 type, 01377 l_int32 thresh, 01378 BOXA *boxa, 01379 l_int32 quality, 01380 l_float32 scalefactor, 01381 l_uint8 **pdata, 01382 size_t *pnbytes, 01383 const char *title) 01384 { 01385 l_int32 i, nbox, seq, bx, by, bw, bh, upscale; 01386 l_float32 scale; 01387 BOX *box, *boxc, *box2; 01388 PIX *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6; 01389 PIXCMAP *cmap; 01390 L_PDF_DATA *lpd; 01391 01392 PROCNAME("pixConvertToPdfDataSegmented"); 01393 01394 if (!pdata) 01395 return ERROR_INT("&data not defined", procName, 1); 01396 *pdata = NULL; 01397 if (!pnbytes) 01398 return ERROR_INT("&nbytes not defined", procName, 1); 01399 *pnbytes = 0; 01400 if (!pixs) 01401 return ERROR_INT("pixs not defined", procName, 1); 01402 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && 01403 type != L_FLATE_ENCODE) 01404 return ERROR_INT("invalid conversion type", procName, 1); 01405 if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) { 01406 L_WARNING("setting scalefactor to 1.0", procName); 01407 scalefactor = 1.0; 01408 } 01409 01410 /* Adjust scalefactor so that the 
product with res gives an integer */ 01411 if (res <= 0) 01412 res = DEFAULT_INPUT_RES; 01413 scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res; 01414 cmap = pixGetColormap(pixs); 01415 01416 /* Simple case: single image to be encoded */ 01417 if (!boxa || boxaGetCount(boxa) == 0) { 01418 if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) { 01419 if (cmap) 01420 pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE); 01421 else 01422 pixt1 = pixConvertTo8(pixs, FALSE); 01423 pixt2 = pixScaleGray2xLIThresh(pixt1, thresh); 01424 pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes, 01425 0, 0, 2 * res, NULL, 0, title); 01426 pixDestroy(&pixt1); 01427 pixDestroy(&pixt2); 01428 } 01429 else { 01430 pixConvertToPdfData(pixs, type, quality, pdata, pnbytes, 01431 0, 0, res, NULL, 0, title); 01432 } 01433 return 0; 01434 } 01435 01436 /* Multiple images to be encoded. If @type == L_G4_ENCODE, 01437 * jpeg encode a version of pixs that is blanked in the non-image 01438 * regions, and paint the scaled non-image part onto it through a mask. 01439 * Otherwise, we must put the non-image part down first and 01440 * then render all the image regions separately on top of it, 01441 * at their own resolution. 
*/ 01442 pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE); /* non-image */ 01443 nbox = boxaGetCount(boxa); 01444 if (type == L_G4_ENCODE) { 01445 pixt2 = pixCreateTemplate(pixs); /* only image regions */ 01446 pixSetBlackOrWhite(pixt2, L_SET_WHITE); 01447 for (i = 0; i < nbox; i++) { 01448 box = boxaGetBox(boxa, i, L_CLONE); 01449 pix = pixClipRectangle(pixs, box, &boxc); 01450 boxGetGeometry(boxc, &bx, &by, &bw, &bh); 01451 pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0); 01452 pixDestroy(&pix); 01453 boxDestroy(&box); 01454 boxDestroy(&boxc); 01455 } 01456 pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC); 01457 if (pixGetDepth(pixt3) == 1) 01458 pixt4 = pixScaleToGray(pixt3, scale); 01459 else 01460 pixt4 = pixScale(pixt3, scale, scale); 01461 pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes, 01462 0, 0, (l_int32)(scale * res), 01463 &lpd, L_FIRST_IMAGE, title); 01464 01465 if (pixGetDepth(pixt1) == 1) { 01466 pixt5 = pixClone(pixt1); 01467 upscale = 1; 01468 } 01469 else { 01470 pixt6 = pixConvertTo8(pixt1, 0); 01471 pixt5 = pixScaleGray2xLIThresh(pixt6, thresh); 01472 pixDestroy(&pixt6); 01473 upscale = 2; 01474 } 01475 pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata, pnbytes, 01476 0, 0, upscale * res, &lpd, L_LAST_IMAGE, title); 01477 pixDestroy(&pixt2); 01478 pixDestroy(&pixt3); 01479 pixDestroy(&pixt4); 01480 pixDestroy(&pixt5); 01481 } 01482 else { 01483 /* Put the non-image part down first. This is the full 01484 size of the page, so we can use it to find the page 01485 height in pixels, which is required for determining 01486 the LL corner of the image relative to the LL corner 01487 of the page. 
*/ 01488 pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0, 01489 res, &lpd, L_FIRST_IMAGE, title); 01490 for (i = 0; i < nbox; i++) { 01491 box = boxaGetBox(boxa, i, L_CLONE); 01492 pixt2 = pixClipRectangle(pixs, box, &boxc); 01493 pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC); 01494 if (pixGetDepth(pixt3) == 1) 01495 pixt4 = pixScaleToGray(pixt3, scale); 01496 else 01497 pixt4 = pixScale(pixt3, scale, scale); 01498 box2 = boxTransform(boxc, 0, 0, scale, scale); 01499 boxGetGeometry(box2, &bx, &by, NULL, &bh); 01500 seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE; 01501 pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes, 01502 bx, by, (l_int32)(scale * res), 01503 &lpd, seq, title); 01504 pixDestroy(&pixt2); 01505 pixDestroy(&pixt3); 01506 pixDestroy(&pixt4); 01507 boxDestroy(&box); 01508 boxDestroy(&boxc); 01509 boxDestroy(&box2); 01510 } 01511 } 01512 01513 pixDestroy(&pixt1); 01514 return 0; 01515 } 01516 01517 01518 /*---------------------------------------------------------------------* 01519 * Helper functions for generating the output pdf string * 01520 *---------------------------------------------------------------------*/ 01521 /*! 01522 * l_generatePdf() 01523 * 01524 * Input: &data (<return> pdf array) 01525 * &nbytes (<return> number of bytes in pdf array) 01526 * lpd (all the required input image data) 01527 * Return: 0 if OK, 1 on error 01528 * 01529 * Notes: 01530 * (1) On error, no data is returned. 
01531 * (2) The objects are: 01532 * 1: Catalog 01533 * 2: Info 01534 * 3: Pages 01535 * 4: Page 01536 * 5: Contents (rendering command) 01537 * 6 to 6+n-1: n XObjects 01538 * 6+n to 6+n+m-1: m colormaps 01539 */ 01540 static l_int32 01541 l_generatePdf(l_uint8 **pdata, 01542 size_t *pnbytes, 01543 L_PDF_DATA *lpd) 01544 { 01545 PROCNAME("l_generatePdf"); 01546 01547 if (!pdata) 01548 return ERROR_INT("&data not defined", procName, 1); 01549 *pdata = NULL; 01550 if (!pnbytes) 01551 return ERROR_INT("&nbytes not defined", procName, 1); 01552 *pnbytes = 0; 01553 if (!lpd) 01554 return ERROR_INT("lpd not defined", procName, 1); 01555 01556 generateFixedStringsPdf(lpd); 01557 generateMediaboxPdf(lpd); 01558 generatePageStringPdf(lpd); 01559 generateContentStringPdf(lpd); 01560 generatePreXStringsPdf(lpd); 01561 generateColormapStringsPdf(lpd); 01562 generateTrailerPdf(lpd); 01563 return generateOutputDataPdf(pdata, pnbytes, lpd); 01564 } 01565 01566 01567 static void 01568 generateFixedStringsPdf(L_PDF_DATA *lpd) 01569 { 01570 char buf[L_SMALLBUF]; 01571 char *version, *datestr; 01572 SARRAY *sa; 01573 01574 /* Accumulate data for the header and objects 1-3 */ 01575 lpd->id = stringNew("%PDF-1.2\n"); 01576 numaAddNumber(lpd->objsize, strlen(lpd->id)); 01577 01578 lpd->obj1 = stringNew("1 0 obj\n" 01579 "<<\n" 01580 "/Type /Catalog\n" 01581 "/Pages 3 0 R\n" 01582 ">>\n" 01583 "endobj\n"); 01584 numaAddNumber(lpd->objsize, strlen(lpd->obj1)); 01585 01586 sa = sarrayCreate(0); 01587 sarrayAddString(sa, (char *)"2 0 obj\n" 01588 "<<\n", L_COPY); 01589 if (lpd->title) { 01590 snprintf(buf, sizeof(buf), "/Title (%s)\n", lpd->title); 01591 sarrayAddString(sa, (char *)buf, L_COPY); 01592 } 01593 if (var_WRITE_DATE_AND_VERSION) { 01594 version = getLeptonicaVersion(); 01595 snprintf(buf, sizeof(buf), 01596 "/Producer (leptonica: %s)\n", version); 01597 FREE(version); 01598 } 01599 else 01600 snprintf(buf, sizeof(buf), "/Producer (leptonica)\n"); 01601 sarrayAddString(sa, (char 
*)buf, L_COPY); 01602 if (var_WRITE_DATE_AND_VERSION) { 01603 datestr = l_getFormattedDate(); 01604 snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr); 01605 sarrayAddString(sa, (char *)buf, L_COPY); 01606 FREE(datestr); 01607 } 01608 sarrayAddString(sa, (char *)">>\n" 01609 "endobj\n", L_COPY); 01610 lpd->obj2 = sarrayToString(sa, 0); 01611 numaAddNumber(lpd->objsize, strlen(lpd->obj2)); 01612 sarrayDestroy(&sa); 01613 01614 lpd->obj3 = stringNew("3 0 obj\n" 01615 "<<\n" 01616 "/Type /Pages\n" 01617 "/Kids [ 4 0 R ]\n" 01618 "/Count 1\n" 01619 ">>\n"); 01620 numaAddNumber(lpd->objsize, strlen(lpd->obj3)); 01621 01622 /* Do the post-datastream string */ 01623 lpd->poststream = stringNew("\n" 01624 "endstream\n" 01625 "endobj\n"); 01626 return; 01627 } 01628 01629 01630 static void 01631 generateMediaboxPdf(L_PDF_DATA *lpd) 01632 { 01633 l_int32 i; 01634 l_float32 xpt, ypt, wpt, hpt, maxx, maxy; 01635 01636 /* First get the full extent of all the images. 01637 * This is the mediabox, in pts. */ 01638 maxx = maxy = 0; 01639 for (i = 0; i < lpd->n; i++) { 01640 ptaGetPt(lpd->xy, i, &xpt, &ypt); 01641 ptaGetPt(lpd->wh, i, &wpt, &hpt); 01642 maxx = L_MAX(maxx, xpt + wpt); 01643 maxy = L_MAX(maxy, ypt + hpt); 01644 } 01645 01646 lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5), 01647 (l_int32)(maxy + 0.5)); 01648 01649 /* ypt is in standard image coordinates: the location of 01650 * the UL image corner with respect to the UL media box corner. 01651 * Rewrite each ypt for PostScript coordinates: the location of 01652 * the LL image corner with respect to the LL media box corner. 
*/ 01653 for (i = 0; i < lpd->n; i++) { 01654 ptaGetPt(lpd->xy, i, &xpt, &ypt); 01655 ptaGetPt(lpd->wh, i, &wpt, &hpt); 01656 ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt); 01657 } 01658 01659 return; 01660 } 01661 01662 01663 static l_int32 01664 generatePageStringPdf(L_PDF_DATA *lpd) 01665 { 01666 char *buf; 01667 char *xstr; 01668 l_int32 bufsize, i, wpt, hpt; 01669 SARRAY *sa; 01670 01671 PROCNAME("generatePageStringPdf"); 01672 01673 /* Allocate 1000 bytes for the boilerplate text, and 01674 * 50 bytes for each reference to an image in the 01675 * ProcSet array. */ 01676 bufsize = 1000 + 50 * lpd->n; 01677 if ((buf = (char *)CALLOC(bufsize, sizeof(char))) == NULL) 01678 return ERROR_INT("calloc fail for buf", procName, 1); 01679 01680 boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt); 01681 sa = sarrayCreate(lpd->n); 01682 for (i = 0; i < lpd->n; i++) { 01683 snprintf(buf, bufsize, "/Im%d %d 0 R ", i + 1, 6 + i); 01684 sarrayAddString(sa, buf, L_COPY); 01685 } 01686 if ((xstr = sarrayToString(sa, 0)) == NULL) 01687 return ERROR_INT("xstr not found", procName, 1); 01688 sarrayDestroy(&sa); 01689 01690 snprintf(buf, bufsize, "4 0 obj\n" 01691 "<<\n" 01692 "/Type /Page\n" 01693 "/Parent 3 0 R\n" 01694 "/MediaBox [%d %d %d %d]\n" 01695 "/Contents 5 0 R\n" 01696 "/Resources\n" 01697 "<<\n" 01698 "/XObject << %s >>\n" 01699 "/ProcSet [ /ImageB /ImageI /ImageC ]\n" 01700 ">>\n" 01701 ">>\n" 01702 "endobj\n", 01703 0, 0, wpt, hpt, xstr); 01704 01705 lpd->obj4 = stringNew(buf); 01706 numaAddNumber(lpd->objsize, strlen(lpd->obj4)); 01707 sarrayDestroy(&sa); 01708 FREE(buf); 01709 FREE(xstr); 01710 return 0; 01711 } 01712 01713 01714 static l_int32 01715 generateContentStringPdf(L_PDF_DATA *lpd) 01716 { 01717 char *buf; 01718 char *cstr; 01719 l_int32 i, bufsize; 01720 l_float32 xpt, ypt, wpt, hpt; 01721 SARRAY *sa; 01722 01723 PROCNAME("generateContentStringPdf"); 01724 01725 bufsize = 1000 + 200 * lpd->n; 01726 if ((buf = (char *)CALLOC(bufsize, sizeof(char))) == 
NULL) 01727 return ERROR_INT("calloc fail for buf", procName, 1); 01728 01729 sa = sarrayCreate(lpd->n); 01730 for (i = 0; i < lpd->n; i++) { 01731 ptaGetPt(lpd->xy, i, &xpt, &ypt); 01732 ptaGetPt(lpd->wh, i, &wpt, &hpt); 01733 snprintf(buf, bufsize, 01734 "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n", 01735 wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1); 01736 sarrayAddString(sa, buf, L_COPY); 01737 } 01738 if ((cstr = sarrayToString(sa, 0)) == NULL) 01739 return ERROR_INT("cstr not found", procName, 1); 01740 sarrayDestroy(&sa); 01741 01742 snprintf(buf, bufsize, "5 0 obj\n" 01743 "<< /Length %d >>\n" 01744 "stream\n" 01745 "%s" 01746 "endstream\n" 01747 "endobj\n", 01748 (l_int32)strlen(cstr), cstr); 01749 01750 lpd->obj5 = stringNew(buf); 01751 numaAddNumber(lpd->objsize, strlen(lpd->obj5)); 01752 sarrayDestroy(&sa); 01753 FREE(buf); 01754 FREE(cstr); 01755 return 0; 01756 } 01757 01758 01759 static l_int32 01760 generatePreXStringsPdf(L_PDF_DATA *lpd) 01761 { 01762 char buff[256]; 01763 char buf[L_BIGBUF]; 01764 char *cstr, *bstr, *fstr, *xstr; 01765 l_int32 i, cmindex; 01766 L_COMPRESSED_DATA *cid; 01767 SARRAY *sa; 01768 01769 PROCNAME("generatePreXStringsPdf"); 01770 01771 sa = lpd->saprex; 01772 cmindex = 6 + lpd->n; /* starting value */ 01773 for (i = 0; i < lpd->n; i++) { 01774 if ((cid = pdfdataGetCid(lpd, i)) == NULL) 01775 return ERROR_INT("cid not found", procName, 1); 01776 01777 if (cid->type == L_G4_ENCODE) { 01778 if (var_WRITE_G4_IMAGE_MASK) { 01779 cstr = stringNew("/ImageMask true\n" 01780 "/ColorSpace /DeviceGray"); 01781 } 01782 else 01783 cstr = stringNew("/ColorSpace /DeviceGray"); 01784 bstr = stringNew("/BitsPerComponent 1\n" 01785 "/Interpolate true"); 01786 snprintf(buff, sizeof(buff), 01787 "/Filter /CCITTFaxDecode\n" 01788 "/DecodeParms\n" 01789 "<<\n" 01790 "/K -1\n" 01791 "/Columns %d\n" 01792 ">>", cid->w); 01793 fstr = stringNew(buff); 01794 } 01795 else if (cid->type == L_JPEG_ENCODE) { 01796 if (cid->spp == 1) 01797 cstr = 
stringNew("/ColorSpace /DeviceGray"); 01798 else if (cid->spp == 3) 01799 cstr = stringNew("/ColorSpace /DeviceRGB"); 01800 else 01801 L_ERROR("spp!= 1 && spp != 3", procName); 01802 bstr = stringNew("/BitsPerComponent 8"); 01803 fstr = stringNew("/Filter /DCTDecode"); 01804 } 01805 else { /* type == L_FLATE_ENCODE */ 01806 if (cid->ncolors > 0) { /* cmapped */ 01807 snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++); 01808 cstr = stringNew(buff); 01809 } 01810 else { 01811 if (cid->spp == 1 && cid->bps == 1) 01812 cstr = stringNew("/ColorSpace /DeviceGray\n" 01813 "/Decode [1 0]"); 01814 else if (cid->spp == 1) /* 8 bpp */ 01815 cstr = stringNew("/ColorSpace /DeviceGray"); 01816 else if (cid->spp == 3) 01817 cstr = stringNew("/ColorSpace /DeviceRGB"); 01818 else 01819 L_ERROR("unknown colorspace", procName); 01820 } 01821 snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps); 01822 bstr = stringNew(buff); 01823 fstr = stringNew("/Filter /FlateDecode"); 01824 } 01825 01826 snprintf(buf, sizeof(buf), 01827 "%d 0 obj\n" 01828 "<<\n" 01829 "/Length %ld\n" 01830 "/Subtype /Image\n" 01831 "%s\n" /* colorspace */ 01832 "/Width %d\n" 01833 "/Height %d\n" 01834 "%s\n" /* bits/component */ 01835 "%s\n" /* filter */ 01836 ">>\n" 01837 "stream\n", 01838 6 + i, cid->nbytescomp, cstr, cid->w, cid->h, bstr, fstr); 01839 xstr = stringNew(buf); 01840 sarrayAddString(sa, xstr, L_INSERT); 01841 numaAddNumber(lpd->objsize, 01842 strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream)); 01843 FREE(cstr); 01844 FREE(bstr); 01845 FREE(fstr); 01846 } 01847 01848 return 0; 01849 } 01850 01851 01852 static l_int32 01853 generateColormapStringsPdf(L_PDF_DATA *lpd) 01854 { 01855 char buf[L_BIGBUF]; 01856 char *cmstr; 01857 l_int32 i, cmindex, ncmap; 01858 L_COMPRESSED_DATA *cid; 01859 SARRAY *sa; 01860 01861 PROCNAME("generateColormapStringsPdf"); 01862 01863 /* In our canonical format, we have 5 objects, followed 01864 * by n XObjects, followed by m colormaps, so the 
index of 01865 * the first colormap object is 6 + n. */ 01866 sa = lpd->sacmap; 01867 cmindex = 6 + lpd->n; /* starting value */ 01868 ncmap = 0; 01869 for (i = 0; i < lpd->n; i++) { 01870 if ((cid = pdfdataGetCid(lpd, i)) == NULL) 01871 return ERROR_INT("cid not found", procName, 1); 01872 if (cid->ncolors == 0) continue; 01873 01874 ncmap++; 01875 snprintf(buf, sizeof(buf), "%d 0 obj\n" 01876 "[ /Indexed /DeviceRGB\n" 01877 "%d\n" 01878 "%s\n" 01879 "]\n" 01880 "endobj\n", 01881 cmindex, cid->ncolors - 1, cid->cmapdatahex); 01882 cmindex++; 01883 cmstr = stringNew(buf); 01884 numaAddNumber(lpd->objsize, strlen(cmstr)); 01885 sarrayAddString(sa, cmstr, L_INSERT); 01886 } 01887 01888 lpd->ncmap = ncmap; 01889 return 0; 01890 } 01891 01892 01893 static void 01894 generateTrailerPdf(L_PDF_DATA *lpd) 01895 { 01896 l_int32 i, n, size, linestart; 01897 NUMA *naloc, *nasize; 01898 01899 /* Let nobj be the number of numbered objects. These numbered 01900 * objects are indexed by their pdf number in arrays naloc[] 01901 * and nasize[]. The 0th object is the 9 byte header. Then 01902 * the number of objects in nasize, which includes the header, 01903 * is n = nobj + 1. The array naloc[] has n + 1 elements, 01904 * because it includes as the last element the starting 01905 * location of xref. The indexing of these objects, their 01906 * starting locations and sizes are: 01907 * 01908 * Object number Starting location Size 01909 * ------------- ----------------- -------------- 01910 * 0 naloc[0] = 0 nasize[0] = 9 01911 * 1 naloc[1] = 9 nasize[1] = 49 01912 * n naloc[n] nasize[n] 01913 * xref naloc[n+1] 01914 * 01915 * We first generate naloc. 
01916 */ 01917 nasize = lpd->objsize; 01918 naloc = lpd->objloc; 01919 linestart = 0; 01920 numaAddNumber(naloc, linestart); /* header */ 01921 n = numaGetCount(nasize); 01922 for (i = 0; i < n; i++) { 01923 numaGetIValue(nasize, i, &size); 01924 linestart += size; 01925 numaAddNumber(naloc, linestart); 01926 } 01927 numaGetIValue(naloc, n, &lpd->xrefloc); /* save it */ 01928 01929 /* Now make the actual trailer string */ 01930 lpd->trailer = makeTrailerStringPdf(naloc); 01931 } 01932 01933 01934 static char * 01935 makeTrailerStringPdf(NUMA *naloc) 01936 { 01937 char *outstr; 01938 char buf[L_BIGBUF]; 01939 l_int32 i, n, linestart, xrefloc; 01940 SARRAY *sa; 01941 01942 PROCNAME("makeTrailerStringPdf"); 01943 01944 if (!naloc) 01945 return (char *)ERROR_PTR("naloc not defined", procName, NULL); 01946 n = numaGetCount(naloc) - 1; /* numbered objects + 1 (yes, +1) */ 01947 01948 sa = sarrayCreate(0); 01949 snprintf(buf, sizeof(buf), "xref\n" 01950 "0 %d\n" 01951 "0000000000 65535 f \n", n); 01952 sarrayAddString(sa, (char *)buf, L_COPY); 01953 for (i = 1; i < n; i++) { 01954 numaGetIValue(naloc, i, &linestart); 01955 snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart); 01956 sarrayAddString(sa, (char *)buf, L_COPY); 01957 } 01958 01959 numaGetIValue(naloc, n, &xrefloc); 01960 snprintf(buf, sizeof(buf), "trailer\n" 01961 "<<\n" 01962 "/Size %d\n" 01963 "/Root 1 0 R\n" 01964 "/Info 2 0 R\n" 01965 ">>\n" 01966 "startxref\n" 01967 "%d\n" 01968 "%%%%EOF\n", n, xrefloc); 01969 sarrayAddString(sa, (char *)buf, L_COPY); 01970 outstr = sarrayToString(sa, 0); 01971 sarrayDestroy(&sa); 01972 return outstr; 01973 } 01974 01975 01976 /*! 01977 * generateOutputDataPdf() 01978 * 01979 * Input: &data (<return> pdf data array) 01980 * &nbytes (<return> size of pdf data array) 01981 * lpd (input data used to make pdf) 01982 * Return: 0 if OK, 1 on error 01983 * 01984 * Notes: 01985 * (1) Only called from l_generatePdf(). On error, no data is returned. 
01986 */ 01987 static l_int32 01988 generateOutputDataPdf(l_uint8 **pdata, 01989 size_t *pnbytes, 01990 L_PDF_DATA *lpd) 01991 { 01992 char *str; 01993 l_uint8 *data; 01994 l_int32 nimages, i, len; 01995 l_int32 *sizes, *locs; 01996 size_t nbytes; 01997 L_COMPRESSED_DATA *cid; 01998 01999 PROCNAME("generateOutputDataPdf"); 02000 02001 if (!pdata) 02002 return ERROR_INT("&data not defined", procName, 1); 02003 *pdata = NULL; 02004 if (!pnbytes) 02005 return ERROR_INT("&nbytes not defined", procName, 1); 02006 nbytes = lpd->xrefloc + strlen(lpd->trailer); 02007 *pnbytes = nbytes; 02008 if ((data = (l_uint8 *)CALLOC(nbytes, sizeof(l_uint8))) == NULL) 02009 return ERROR_INT("calloc fail for data", procName, 1); 02010 *pdata = data; 02011 02012 sizes = numaGetIArray(lpd->objsize); 02013 locs = numaGetIArray(lpd->objloc); 02014 memcpy((char *)data, lpd->id, sizes[0]); 02015 memcpy((char *)(data + locs[1]), lpd->obj1, sizes[1]); 02016 memcpy((char *)(data + locs[2]), lpd->obj2, sizes[2]); 02017 memcpy((char *)(data + locs[3]), lpd->obj3, sizes[3]); 02018 memcpy((char *)(data + locs[4]), lpd->obj4, sizes[4]); 02019 memcpy((char *)(data + locs[5]), lpd->obj5, sizes[5]); 02020 02021 /* Each image has 3 parts: variable preamble, the compressed 02022 * data stream, and the fixed poststream. 
*/ 02023 nimages = lpd->n; 02024 for (i = 0; i < nimages; i++) { 02025 if ((cid = pdfdataGetCid(lpd, i)) == NULL) /* this should not happen */ 02026 return ERROR_INT("cid not found", procName, 1); 02027 str = sarrayGetString(lpd->saprex, i, L_NOCOPY); 02028 len = strlen(str); 02029 memcpy((char *)(data + locs[6 + i]), str, len); 02030 memcpy((char *)(data + locs[6 + i] + len), 02031 (char *)cid->datacomp, cid->nbytescomp); 02032 memcpy((char *)(data + locs[6 + i] + len + cid->nbytescomp), 02033 lpd->poststream, strlen(lpd->poststream)); 02034 } 02035 02036 /* Each colormap is simply a stored string */ 02037 for (i = 0; i < lpd->ncmap; i++) { 02038 str = sarrayGetString(lpd->sacmap, i, L_NOCOPY); 02039 memcpy((char *)(data + locs[6 + nimages + i]), str, strlen(str)); 02040 } 02041 02042 /* And finally the trailer */ 02043 memcpy((char *)(data + lpd->xrefloc), lpd->trailer, strlen(lpd->trailer)); 02044 FREE(sizes); 02045 FREE(locs); 02046 return 0; 02047 } 02048 02049 02050 /*---------------------------------------------------------------------* 02051 * Multi-page concatenation * 02052 *---------------------------------------------------------------------*/ 02053 /*! 02054 * concatenatePdf() 02055 * 02056 * Input: directory name (containing single-page pdf files) 02057 * substr (<optional> substring filter on filenames; can be NULL) 02058 * fileout (concatenated pdf file) 02059 * Return: 0 if OK, 1 on error 02060 * 02061 * Notes: 02062 * (1) This only works with leptonica-formatted single-page pdf files. 02063 * (2) If @substr is not NULL, only filenames that contain 02064 * the substring can be returned. If @substr == NULL, 02065 * none of the filenames are filtered out. 02066 * (3) The files in the directory, after optional filtering by 02067 * the substring, are lexically sorted in increasing order 02068 * before concatenation. 
02069 */ 02070 l_int32 02071 concatenatePdf(const char *dirname, 02072 const char *substr, 02073 const char *fileout) 02074 { 02075 l_int32 ret; 02076 SARRAY *sa; 02077 02078 PROCNAME("concatenatePdf"); 02079 02080 if (!dirname) 02081 return ERROR_INT("dirname not defined", procName, 1); 02082 if (!fileout) 02083 return ERROR_INT("fileout not defined", procName, 1); 02084 02085 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL) 02086 return ERROR_INT("sa not made", procName, 1); 02087 ret = saConcatenatePdf(sa, fileout); 02088 sarrayDestroy(&sa); 02089 return ret; 02090 } 02091 02092 02093 /*! 02094 * saConcatenatePdf() 02095 * 02096 * Input: sarray (of pathnames for single-page pdf files) 02097 * fileout (concatenated pdf file) 02098 * Return: 0 if OK, 1 on error 02099 * 02100 * Notes: 02101 * (1) This only works with leptonica-formatted single-page pdf files. 02102 */ 02103 l_int32 02104 saConcatenatePdf(SARRAY *sa, 02105 const char *fileout) 02106 { 02107 l_uint8 *data; 02108 l_int32 ret; 02109 size_t nbytes; 02110 02111 PROCNAME("saConcatenatePdf"); 02112 02113 if (!sa) 02114 return ERROR_INT("sa not defined", procName, 1); 02115 if (!fileout) 02116 return ERROR_INT("fileout not defined", procName, 1); 02117 02118 ret = saConcatenatePdfToData(sa, &data, &nbytes); 02119 if (ret) 02120 return ERROR_INT("pdf data not made", procName, 1); 02121 ret = l_binaryWrite(fileout, "w", data, nbytes); 02122 FREE(data); 02123 return ret; 02124 } 02125 02126 02127 /*! 02128 * ptraConcatenatePdf() 02129 * 02130 * Input: ptra (array of pdf strings, each for a single-page pdf file) 02131 * fileout (concatenated pdf file) 02132 * Return: 0 if OK, 1 on error 02133 * 02134 * Notes: 02135 * (1) This only works with leptonica-formatted single-page pdf files. 
02136 */ 02137 l_int32 02138 ptraConcatenatePdf(L_PTRA *pa, 02139 const char *fileout) 02140 { 02141 l_uint8 *data; 02142 l_int32 ret; 02143 size_t nbytes; 02144 02145 PROCNAME("ptraConcatenatePdf"); 02146 02147 if (!pa) 02148 return ERROR_INT("pa not defined", procName, 1); 02149 if (!fileout) 02150 return ERROR_INT("fileout not defined", procName, 1); 02151 02152 ret = ptraConcatenatePdfToData(pa, NULL, &data, &nbytes); 02153 if (ret) 02154 return ERROR_INT("pdf data not made", procName, 1); 02155 ret = l_binaryWrite(fileout, "w", data, nbytes); 02156 FREE(data); 02157 return ret; 02158 } 02159 02160 02161 /*! 02162 * concatenatePdfToData() 02163 * 02164 * Input: directory name (containing single-page pdf files) 02165 * substr (<optional> substring filter on filenames; can be NULL) 02166 * &data (<return> concatenated pdf data in memory) 02167 * &nbytes (<return> number of bytes in pdf data) 02168 * Return: 0 if OK, 1 on error 02169 * 02170 * Notes: 02171 * (1) This only works with leptonica-formatted single-page pdf files. 02172 * (2) If @substr is not NULL, only filenames that contain 02173 * the substring can be returned. If @substr == NULL, 02174 * none of the filenames are filtered out. 02175 * (3) The files in the directory, after optional filtering by 02176 * the substring, are lexically sorted in increasing order 02177 * before concatenation. 
02178 */ 02179 l_int32 02180 concatenatePdfToData(const char *dirname, 02181 const char *substr, 02182 l_uint8 **pdata, 02183 size_t *pnbytes) 02184 { 02185 l_int32 ret; 02186 SARRAY *sa; 02187 02188 PROCNAME("concatenatePdfToData"); 02189 02190 if (!pdata) 02191 return ERROR_INT("&data not defined", procName, 1); 02192 *pdata = NULL; 02193 if (!pnbytes) 02194 return ERROR_INT("&nbytes not defined", procName, 1); 02195 *pnbytes = 0; 02196 if (!dirname) 02197 return ERROR_INT("dirname not defined", procName, 1); 02198 02199 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL) 02200 return ERROR_INT("sa not made", procName, 1); 02201 ret = saConcatenatePdfToData(sa, pdata, pnbytes); 02202 sarrayDestroy(&sa); 02203 return ret; 02204 } 02205 02206 02207 /*! 02208 * saConcatenatePdfToData() 02209 * 02210 * Input: sarray (of pathnames for single-page pdf files) 02211 * &data (<return> concatenated pdf data in memory) 02212 * &nbytes (<return> number of bytes in pdf data) 02213 * Return: 0 if OK, 1 on error 02214 * 02215 * Notes: 02216 * (1) This only works with leptonica-formatted single-page pdf files. 
02217 */ 02218 l_int32 02219 saConcatenatePdfToData(SARRAY *sa, 02220 l_uint8 **pdata, 02221 size_t *pnbytes) 02222 { 02223 char *fname; 02224 l_int32 i, npages, ret; 02225 L_BYTEA *bas; 02226 L_PTRA *pa_data; /* input pdf data for each page */ 02227 02228 PROCNAME("saConcatenatePdfToData"); 02229 02230 if (!pdata) 02231 return ERROR_INT("&data not defined", procName, 1); 02232 *pdata = NULL; 02233 if (!pnbytes) 02234 return ERROR_INT("&nbytes not defined", procName, 1); 02235 *pnbytes = 0; 02236 if (!sa) 02237 return ERROR_INT("sa not defined", procName, 1); 02238 02239 /* Read the pdf files into memory */ 02240 if ((npages = sarrayGetCount(sa)) == 0) 02241 return ERROR_INT("no filenames found", procName, 1); 02242 pa_data = ptraCreate(npages); 02243 for (i = 0; i < npages; i++) { 02244 fname = sarrayGetString(sa, i, L_NOCOPY); 02245 bas = l_byteaInitFromFile(fname); 02246 ptraAdd(pa_data, bas); 02247 } 02248 02249 ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes); 02250 02251 /* Cleanup: some pages could have been removed */ 02252 ptraGetActualCount(pa_data, &npages); 02253 for (i = 0; i < npages; i++) { 02254 bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION); 02255 l_byteaDestroy(&bas); 02256 } 02257 ptraDestroy(&pa_data, FALSE, FALSE); 02258 return ret; 02259 } 02260 02261 02262 /*! 02263 * ptraConcatenatePdfToData() 02264 * 02265 * Input: ptra (array of pdf strings, each for a single-page pdf file) 02266 * sarray (<optional> of pathnames for input pdf files) 02267 * &data (<return> concatenated pdf data in memory) 02268 * &nbytes (<return> number of bytes in pdf data) 02269 * Return: 0 if OK, 1 on error 02270 * 02271 * Notes: 02272 * (1) This only works with leptonica-formatted single-page pdf files. 02273 * pdf files generated by other programs will have unpredictable 02274 * (and usually bad) results. The requirements for each pdf file: 02275 * (a) The Catalog and Info objects are the first two. 
02276 * (b) Object 3 is Pages 02277 * (c) Object 4 is Page 02278 * (d) The remaining objects are Contents, XObjects, and ColorSpace 02279 * (2) We remove trailers from each page, and append the full trailer 02280 * for all pages at the end. 02281 * (3) For all but the first file, remove the ID and the first 3 02282 * objects (catalog, info, pages), so that each subsequent 02283 * file has only objects of these classes: 02284 * Page, Contents, XObject, ColorSpace (Indexed RGB). 02285 * For those objects, we substitute these refs to objects 02286 * in the local file: 02287 * Page: Parent(object 3), Contents, XObject(typically multiple) 02288 * XObject: [ColorSpace if indexed] 02289 * The Pages object on the first page (object 3) has a Kids array 02290 * of references to all the Page objects, with a Count equal 02291 * to the number of pages. Each Page object refers back to 02292 * this parent. 02293 */ 02294 l_int32 02295 ptraConcatenatePdfToData(L_PTRA *pa_data, 02296 SARRAY *sa, 02297 l_uint8 **pdata, 02298 size_t *pnbytes) 02299 { 02300 char *fname, *str_pages, *str_trailer; 02301 l_uint8 *pdfdata, *data; 02302 l_int32 i, j, index, nobj, npages; 02303 l_int32 *sizes, *locs; 02304 size_t size; 02305 L_BYTEA *bas, *bad, *bat1, *bat2; 02306 NUMA *na, *na_locs, *na_objs, *napage, *na_sizes, *na_outlocs; 02307 NUMAA *naa_locs; /* object locations on each page */ 02308 NUMAA *naa_objs; /* object mapping numbers to new values */ 02309 02310 PROCNAME("ptraConcatenatePdfToData"); 02311 02312 if (!pdata) 02313 return ERROR_INT("&data not defined", procName, 1); 02314 *pdata = NULL; 02315 if (!pnbytes) 02316 return ERROR_INT("&nbytes not defined", procName, 1); 02317 *pnbytes = 0; 02318 if (!pa_data) 02319 return ERROR_INT("pa_data not defined", procName, 1); 02320 02321 /* Parse the files and find the object locations. 02322 * Remove file data that cannot be parsed. 
*/ 02323 ptraGetActualCount(pa_data, &npages); 02324 naa_locs = numaaCreate(npages); 02325 for (i = 0; i < npages; i++) { 02326 bas = (L_BYTEA *)ptraGetHandle(pa_data, i); 02327 if (parseTrailerPdf(bas, &na_locs) != 0) { 02328 bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION); 02329 l_byteaDestroy(&bas); 02330 if (sa) { 02331 fname = sarrayGetString(sa, i, L_NOCOPY); 02332 L_ERROR_STRING("can't parse file %s; skipping", 02333 procName, fname); 02334 } 02335 else { 02336 L_ERROR_INT("can't parse file %d; skipping", procName, i); 02337 } 02338 } 02339 else { 02340 numaaAddNuma(naa_locs, na_locs, L_INSERT); 02341 } 02342 } 02343 02344 /* Recompute npages in case some of the files were not pdf */ 02345 ptraCompactArray(pa_data); 02346 ptraGetActualCount(pa_data, &npages); 02347 if (npages == 0) { 02348 numaaDestroy(&naa_locs); 02349 return ERROR_INT("no parsable pdf files found", procName, 1); 02350 } 02351 02352 /* Find the mapping from initial to final object numbers */ 02353 naa_objs = numaaCreate(npages); /* stores final object numbers */ 02354 napage = numaCreate(npages); /* stores "Page" object numbers */ 02355 index = 0; 02356 for (i = 0; i < npages; i++) { 02357 na = numaaGetNuma(naa_locs, i, L_CLONE); 02358 nobj = numaGetCount(na); 02359 if (i == 0) { 02360 numaAddNumber(napage, 4); /* object 4 on first page */ 02361 na_objs = numaMakeSequence(0.0, 1.0, nobj - 1); 02362 index = nobj - 1; 02363 } 02364 else { /* skip the first 3 objects in each file */ 02365 numaAddNumber(napage, index); /* Page object is first we add */ 02366 na_objs = numaMakeConstant(0.0, nobj - 1); 02367 numaReplaceNumber(na_objs, 3, 3); /* refers to parent of all */ 02368 for (j = 4; j < nobj - 1; j++) 02369 numaSetValue(na_objs, j, index++); 02370 } 02371 numaaAddNuma(naa_objs, na_objs, L_INSERT); 02372 numaDestroy(&na); 02373 } 02374 02375 /* Make the Pages object (#3) */ 02376 str_pages = generatePagesObjStringPdf(napage); 02377 02378 /* Build the output */ 02379 bad = 
l_byteaCreate(5000); 02380 na_outlocs = numaCreate(0); /* locations of all output objects */ 02381 for (i = 0; i < npages; i++) { 02382 bas = (L_BYTEA *)ptraGetHandle(pa_data, i); 02383 pdfdata = l_byteaGetData(bas, &size); 02384 na_locs = numaaGetNuma(naa_locs, i, L_CLONE); /* locs on this page */ 02385 na_objs = numaaGetNuma(naa_objs, i, L_CLONE); /* obj # on this page */ 02386 nobj = numaGetCount(na_locs) - 1; 02387 na_sizes = numaMakeDelta(na_locs); /* object sizes on this page */ 02388 sizes = numaGetIArray(na_sizes); 02389 locs = numaGetIArray(na_locs); 02390 if (i == 0) { 02391 l_byteaAppendData(bad, pdfdata, sizes[0]); 02392 l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]); 02393 l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]); 02394 l_byteaAppendString(bad, str_pages); 02395 for (j = 0; j < 4; j++) 02396 numaAddNumber(na_outlocs, locs[j]); 02397 } 02398 for (j = 4; j < nobj; j++) { 02399 numaAddNumber(na_outlocs, l_byteaGetSize(bad)); 02400 bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]); 02401 bat2 = substituteObjectNumbers(bat1, na_objs); 02402 data = l_byteaGetData(bat2, &size); 02403 l_byteaAppendData(bad, data, size); 02404 l_byteaDestroy(&bat1); 02405 l_byteaDestroy(&bat2); 02406 } 02407 if (i == npages - 1) /* last one */ 02408 numaAddNumber(na_outlocs, l_byteaGetSize(bad)); 02409 FREE(sizes); 02410 FREE(locs); 02411 numaDestroy(&na_locs); 02412 numaDestroy(&na_objs); 02413 numaDestroy(&na_sizes); 02414 } 02415 02416 /* Add the trailer */ 02417 str_trailer = makeTrailerStringPdf(na_outlocs); 02418 l_byteaAppendString(bad, str_trailer); 02419 02420 /* Transfer the output data */ 02421 *pdata = l_byteaCopyData(bad, pnbytes); 02422 l_byteaDestroy(&bad); 02423 02424 #if DEBUG_MULTIPAGE 02425 fprintf(stderr, "******** object mapper **********"); 02426 numaaWriteStream(stderr, naa_objs); 02427 02428 fprintf(stderr, "******** Page object numbers ***********"); 02429 numaWriteStream(stderr, napage); 02430 02431 fprintf(stderr, "******** Pages 
object ***********\n"); 02432 fprintf(stderr, "%s\n", str_pages); 02433 #endif /* DEBUG_MULTIPAGE */ 02434 02435 numaaDestroy(&naa_locs); 02436 numaaDestroy(&naa_objs); 02437 numaDestroy(&napage); 02438 numaDestroy(&na_outlocs); 02439 FREE(str_pages); 02440 FREE(str_trailer); 02441 return 0; 02442 } 02443 02444 02445 /*---------------------------------------------------------------------* 02446 * Helper functions for generating the multi-page pdf output * 02447 *---------------------------------------------------------------------*/ 02448 /*! 02449 * parseTrailerPdf() 02450 * 02451 * Input: bas (lba of a pdf file) 02452 * na (<return> byte locations of the beginning of each object) 02453 * Return: 0 if OK, 1 on error 02454 */ 02455 static l_int32 02456 parseTrailerPdf(L_BYTEA *bas, 02457 NUMA **pna) 02458 { 02459 char *str; 02460 l_uint8 nl = '\n'; 02461 l_uint8 *data; 02462 l_int32 i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok; 02463 size_t size; 02464 NUMA *na, *naobj, *naxref; 02465 SARRAY *sa; 02466 02467 PROCNAME("parseTrailerPdf"); 02468 02469 if (!pna) 02470 return ERROR_INT("&na not defined", procName, 1); 02471 *pna = NULL; 02472 if (!bas) 02473 return ERROR_INT("bas not defined", procName, 1); 02474 data = l_byteaGetData(bas, &size); 02475 if (strncmp((char *)data, "%PDF-1.", 7) != 0) 02476 return ERROR_INT("PDF header signature not found", procName, 1); 02477 02478 /* Search for "startxref" starting 50 bytes from the EOF */ 02479 start = 0; 02480 if (size > 50) 02481 start = size - 50; 02482 arrayFindSequence(data + start, size - start, 02483 (l_uint8 *)"startxref\n", 10, &loc, &found); 02484 if (!found) 02485 return ERROR_INT("startxref not found!", procName, 1); 02486 if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1) 02487 return ERROR_INT("xrefloc not found!", procName, 1); 02488 if (xrefloc < 0 || xrefloc >= size) 02489 return ERROR_INT("invalid xrefloc!", procName, 1); 02490 sa = 
sarrayCreateLinesFromString((char *)(data + xrefloc), 0); 02491 str = sarrayGetString(sa, 1, L_NOCOPY); 02492 if ((sscanf(str, "0 %d", &nobj)) != 1) 02493 return ERROR_INT("nobj not found", procName, 1); 02494 02495 /* Get starting locations. The numa index is the 02496 * object number. loc[0] is the ID; loc[nobj + 1] is xrefloc. */ 02497 na = numaCreate(nobj + 1); 02498 *pna = na; 02499 for (i = 0; i < nobj; i++) { 02500 str = sarrayGetString(sa, i + 2, L_NOCOPY); 02501 sscanf(str, "%d", &startloc); 02502 numaAddNumber(na, startloc); 02503 } 02504 numaAddNumber(na, xrefloc); 02505 02506 #if DEBUG_MULTIPAGE 02507 fprintf(stderr, "************** Trailer string ************\n"); 02508 fprintf(stderr, "xrefloc = %d", xrefloc); 02509 sarrayWriteStream(stderr, sa); 02510 02511 fprintf(stderr, "************** Object locations ************"); 02512 numaWriteStream(stderr, na); 02513 #endif /* DEBUG_MULTIPAGE */ 02514 sarrayDestroy(&sa); 02515 02516 /* Verify correct parsing */ 02517 trailer_ok = TRUE; 02518 for (i = 1; i < nobj; i++) { 02519 numaGetIValue(na, i, &startloc); 02520 if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) { 02521 L_ERROR_INT("bad trailer for object %d", procName, i); 02522 trailer_ok = FALSE; 02523 break; 02524 } 02525 } 02526 02527 /* If the trailer is broken, reconstruct the correct obj locations */ 02528 if (!trailer_ok) { 02529 L_INFO("rebuilding pdf trailer", procName); 02530 numaEmpty(na); 02531 numaAddNumber(na, 0); 02532 l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &naobj); 02533 nobj = numaGetCount(naobj); 02534 for (i = 0; i < nobj; i++) { 02535 numaGetIValue(naobj, i, &loc); 02536 for (j = loc - 1; j > 0; j--) { 02537 if (data[j] == nl) 02538 break; 02539 } 02540 numaAddNumber(na, j + 1); 02541 } 02542 l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &naxref); 02543 numaGetIValue(naxref, 0, &loc); 02544 numaAddNumber(na, loc); 02545 numaDestroy(&naobj); 02546 numaDestroy(&naxref); 02547 } 02548 02549 return 0; 
02550 } 02551 02552 02553 static char * 02554 generatePagesObjStringPdf(NUMA *napage) 02555 { 02556 char *str, *outstr; 02557 char *buf; 02558 l_int32 i, n, index, bufsize; 02559 SARRAY *sa; 02560 02561 PROCNAME("generatePagesObjStringPdf"); 02562 02563 if (!napage) 02564 return (char *)ERROR_PTR("napage not defined", procName, NULL); 02565 02566 n = numaGetCount(napage); 02567 bufsize = 100 + 16 * n; 02568 buf = (char *)CALLOC(bufsize, sizeof(char)); 02569 sa = sarrayCreate(n); 02570 for (i = 0; i < n; i++) { 02571 numaGetIValue(napage, i, &index); 02572 snprintf(buf, bufsize, " %d 0 R ", index); 02573 sarrayAddString(sa, buf, L_COPY); 02574 } 02575 02576 str = sarrayToString(sa, 0); 02577 snprintf(buf, bufsize, "3 0 obj\n" 02578 "<<\n" 02579 "/Type /Pages\n" 02580 "/Kids [%s]\n" 02581 "/Count %d\n" 02582 ">>\n", str, n); 02583 outstr = stringNew(buf); 02584 sarrayDestroy(&sa); 02585 FREE(str); 02586 FREE(buf); 02587 return outstr; 02588 } 02589 02590 02591 /*! 02592 * substituteObjectNumbers() 02593 * 02594 * Input: bas (lba of a pdf object) 02595 * na_objs (object number mapping array) 02596 * Return: bad (lba of rewritten pdf for the object) 02597 * 02598 * Notes: 02599 * (1) Interpret the first set of bytes as the object number, 02600 * map to the new number, and write it out. 02601 * (2) Find all occurrences of this 4-byte sequence: " 0 R" 02602 * (3) Find the location and value of the integer preceeding this, 02603 * and map it to the new value. 02604 * (4) Rewrite the object with new object numbers. 
02605 */ 02606 static L_BYTEA * 02607 substituteObjectNumbers(L_BYTEA *bas, 02608 NUMA *na_objs) 02609 { 02610 l_uint8 space = ' '; 02611 l_uint8 *datas; 02612 l_uint8 buf[32]; /* only needs to hold one integer in ascii format */ 02613 l_int32 start, nrepl, i, j, objin, objout; 02614 l_int32 *objs, *matches; 02615 size_t size; 02616 L_BYTEA *bad; 02617 NUMA *na_match; 02618 02619 datas = l_byteaGetData(bas, &size); 02620 bad = l_byteaCreate(100); 02621 objs = numaGetIArray(na_objs); /* object number mapper */ 02622 02623 /* Substitute the object number on the first line */ 02624 sscanf((char *)datas, "%d", &objin); 02625 objout = objs[objin]; 02626 snprintf((char *)buf, 32, "%d", objout); 02627 l_byteaAppendString(bad, (char *)buf); 02628 02629 /* Find the set of matching locations for object references */ 02630 arrayFindSequence(datas, size, &space, 1, &start, NULL); 02631 na_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4); 02632 if (!na_match) { 02633 l_byteaAppendData(bad, datas + start, size - start); 02634 FREE(objs); 02635 return bad; 02636 } 02637 02638 /* Substitute all the object reference numbers */ 02639 nrepl = numaGetCount(na_match); 02640 matches = numaGetIArray(na_match); 02641 for (i = 0; i < nrepl; i++) { 02642 /* Find the first space before the object number */ 02643 for (j = matches[i] - 1; j > 0; j--) { 02644 if (datas[j] == space) 02645 break; 02646 } 02647 /* Copy bytes from 'start' up to the object number */ 02648 l_byteaAppendData(bad, datas + start, j - start + 1); 02649 sscanf((char *)(datas + j + 1), "%d", &objin); 02650 objout = objs[objin]; 02651 snprintf((char *)buf, 32, "%d", objout); 02652 l_byteaAppendString(bad, (char *)buf); 02653 start = matches[i]; 02654 } 02655 l_byteaAppendData(bad, datas + start, size - start); 02656 02657 FREE(objs); 02658 FREE(matches); 02659 numaDestroy(&na_match); 02660 return bad; 02661 } 02662 02663 02664 /*---------------------------------------------------------------------* 02665 * 
Create/destroy/access pdf data * 02666 *---------------------------------------------------------------------*/ 02667 static L_PDF_DATA * 02668 pdfdataCreate(const char *title) 02669 { 02670 L_PDF_DATA *lpd; 02671 02672 lpd = (L_PDF_DATA *)CALLOC(1, sizeof(L_PDF_DATA)); 02673 if (title) lpd->title = stringNew(title); 02674 lpd->cida = ptraCreate(10); 02675 lpd->xy = ptaCreate(10); 02676 lpd->wh = ptaCreate(10); 02677 lpd->saprex = sarrayCreate(10); 02678 lpd->sacmap = sarrayCreate(10); 02679 lpd->objsize = numaCreate(20); 02680 lpd->objloc = numaCreate(20); 02681 return lpd; 02682 } 02683 02684 static void 02685 pdfdataDestroy(L_PDF_DATA **plpd) 02686 { 02687 l_int32 i; 02688 L_COMPRESSED_DATA *cid; 02689 L_PDF_DATA *lpd; 02690 02691 PROCNAME("pdfdataDestroy"); 02692 02693 if (plpd== NULL) { 02694 L_WARNING("ptr address is null!", procName); 02695 return; 02696 } 02697 if ((lpd = *plpd) == NULL) 02698 return; 02699 02700 if (lpd->title) FREE(lpd->title); 02701 for (i = 0; i < lpd->n; i++) { 02702 cid = (L_COMPRESSED_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION); 02703 compressed_dataDestroy(&cid); 02704 } 02705 02706 ptraDestroy(&lpd->cida, 0, 0); 02707 if (lpd->id) FREE(lpd->id); 02708 if (lpd->obj1) FREE(lpd->obj1); 02709 if (lpd->obj2) FREE(lpd->obj2); 02710 if (lpd->obj3) FREE(lpd->obj3); 02711 if (lpd->obj4) FREE(lpd->obj4); 02712 if (lpd->obj5) FREE(lpd->obj5); 02713 if (lpd->poststream) FREE(lpd->poststream); 02714 if (lpd->trailer) FREE(lpd->trailer); 02715 if (lpd->xy) ptaDestroy(&lpd->xy); 02716 if (lpd->wh) ptaDestroy(&lpd->wh); 02717 if (lpd->mediabox) boxDestroy(&lpd->mediabox); 02718 if (lpd->saprex) sarrayDestroy(&lpd->saprex); 02719 if (lpd->sacmap) sarrayDestroy(&lpd->sacmap); 02720 if (lpd->objsize) numaDestroy(&lpd->objsize); 02721 if (lpd->objloc) numaDestroy(&lpd->objloc); 02722 FREE(lpd); 02723 *plpd = NULL; 02724 return; 02725 } 02726 02727 02728 static L_COMPRESSED_DATA * 02729 pdfdataGetCid(L_PDF_DATA *lpd, 02730 l_int32 index) 02731 { 
02732 PROCNAME("pdfdataGetCid"); 02733 02734 if (!lpd) 02735 return (L_COMPRESSED_DATA *)ERROR_PTR("lpd not defined", 02736 procName, NULL); 02737 if (index < 0 || index >= lpd->n) 02738 return (L_COMPRESSED_DATA *)ERROR_PTR("invalid image index", 02739 procName, NULL); 02740 02741 return (L_COMPRESSED_DATA *)ptraGetHandle(lpd->cida, index); 02742 } 02743 02744 02745 /*---------------------------------------------------------------------* 02746 * Set flags for special modes * 02747 *---------------------------------------------------------------------*/ 02748 /*! 02749 * l_pdfSetG4ImageMask() 02750 * 02751 * Input: flag (1 for writing g4 data as fg only through a mask; 02752 * 0 for writing fg and bg) 02753 * Return: void 02754 * 02755 * Notes: 02756 * (1) The default is for writing only the fg (through the mask). 02757 * That way when you write a 1 bpp image, the bg is transparent, 02758 * so any previously written image remains visible behind it. 02759 */ 02760 void 02761 l_pdfSetG4ImageMask(l_int32 flag) 02762 { 02763 var_WRITE_G4_IMAGE_MASK = flag; 02764 } 02765 02766 02767 /*! 02768 * l_pdfSetDateAndVersion() 02769 * 02770 * Input: flag (1 for writing date/time and leptonica version; 02771 * 0 for omitting this from the metadata) 02772 * Return: void 02773 * 02774 * Notes: 02775 * (1) The default is for writing this data. For regression tests 02776 * that compare output against golden files, it is useful to omit. 02777 */ 02778 void 02779 l_pdfSetDateAndVersion(l_int32 flag) 02780 { 02781 var_WRITE_DATE_AND_VERSION = flag; 02782 } 02783 02784 02785 /* --------------------------------------------*/ 02786 #endif /* USE_PDFIO */ 02787 /* --------------------------------------------*/ 02788