Leptonica 1.68
C Image Processing Library

pdfio.c File Reference

Read/write pdf format from/to file and memory. More...

#include <string.h>
#include <math.h>
#include "allheaders.h"

Go to the source code of this file.

Defines

#define L_SMALLBUF   256
#define L_BIGBUF   2048
#define DEBUG_MULTIPAGE   0

Functions

static l_int32 l_generatePdf (l_uint8 **pdata, size_t *pnbytes, L_PDF_DATA *lpd)
static void generateFixedStringsPdf (L_PDF_DATA *lpd)
static void generateMediaboxPdf (L_PDF_DATA *lpd)
static l_int32 generatePageStringPdf (L_PDF_DATA *lpd)
static l_int32 generateContentStringPdf (L_PDF_DATA *lpd)
static l_int32 generatePreXStringsPdf (L_PDF_DATA *lpd)
static l_int32 generateColormapStringsPdf (L_PDF_DATA *lpd)
static void generateTrailerPdf (L_PDF_DATA *lpd)
static char * makeTrailerStringPdf (NUMA *naloc)
static l_int32 generateOutputDataPdf (l_uint8 **pdata, size_t *pnbytes, L_PDF_DATA *lpd)
static l_int32 parseTrailerPdf (L_BYTEA *bas, NUMA **pna)
static char * generatePagesObjStringPdf (NUMA *napage)
static L_BYTEA * substituteObjectNumbers (L_BYTEA *bas, NUMA *na_objs)
static L_PDF_DATA * pdfdataCreate (const char *title)
static void pdfdataDestroy (L_PDF_DATA **plpd)
static L_COMPRESSED_DATA * pdfdataGetCid (L_PDF_DATA *lpd, l_int32 index)
l_int32 convertFilesToPdf (const char *dirname, const char *substr, l_int32 res, l_float32 scalefactor, l_int32 quality, const char *title, const char *fileout)
l_int32 saConvertFilesToPdf (SARRAY *sa, l_int32 res, l_float32 scalefactor, l_int32 quality, const char *title, const char *fileout)
l_int32 saConvertFilesToPdfData (SARRAY *sa, l_int32 res, l_float32 scalefactor, l_int32 quality, const char *title, l_uint8 **pdata, size_t *pnbytes)
l_int32 selectDefaultPdfEncoding (PIX *pix, l_int32 *ptype)
l_int32 convertToPdf (const char *filein, l_int32 type, l_int32 quality, const char *fileout, l_int32 x, l_int32 y, l_int32 res, L_PDF_DATA **plpd, l_int32 position, const char *title)
l_int32 convertImageDataToPdf (l_uint8 *imdata, size_t size, l_int32 type, l_int32 quality, const char *fileout, l_int32 x, l_int32 y, l_int32 res, L_PDF_DATA **plpd, l_int32 position, const char *title)
l_int32 convertToPdfData (const char *filein, l_int32 type, l_int32 quality, l_uint8 **pdata, size_t *pnbytes, l_int32 x, l_int32 y, l_int32 res, L_PDF_DATA **plpd, l_int32 position, const char *title)
l_int32 convertImageDataToPdfData (l_uint8 *imdata, size_t size, l_int32 type, l_int32 quality, l_uint8 **pdata, size_t *pnbytes, l_int32 x, l_int32 y, l_int32 res, L_PDF_DATA **plpd, l_int32 position, const char *title)
l_int32 pixConvertToPdf (PIX *pix, l_int32 type, l_int32 quality, const char *fileout, l_int32 x, l_int32 y, l_int32 res, L_PDF_DATA **plpd, l_int32 position, const char *title)
l_int32 pixConvertToPdfData (PIX *pix, l_int32 type, l_int32 quality, l_uint8 **pdata, size_t *pnbytes, l_int32 x, l_int32 y, l_int32 res, L_PDF_DATA **plpd, l_int32 position, const char *title)
l_int32 pixWriteStreamPdf (FILE *fp, PIX *pix, l_int32 res, const char *title)
l_int32 convertSegmentedFilesToPdf (const char *dirname, const char *substr, l_int32 res, l_int32 type, l_int32 thresh, BOXAA *baa, l_int32 quality, l_float32 scalefactor, const char *title, const char *fileout)
l_int32 convertToPdfSegmented (const char *filein, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, const char *fileout)
l_int32 pixConvertToPdfSegmented (PIX *pixs, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, const char *fileout, const char *title)
l_int32 convertToPdfDataSegmented (const char *filein, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, l_uint8 **pdata, size_t *pnbytes)
l_int32 pixConvertToPdfDataSegmented (PIX *pixs, l_int32 res, l_int32 type, l_int32 thresh, BOXA *boxa, l_int32 quality, l_float32 scalefactor, l_uint8 **pdata, size_t *pnbytes, const char *title)
l_int32 concatenatePdf (const char *dirname, const char *substr, const char *fileout)
l_int32 saConcatenatePdf (SARRAY *sa, const char *fileout)
l_int32 ptraConcatenatePdf (L_PTRA *pa, const char *fileout)
l_int32 concatenatePdfToData (const char *dirname, const char *substr, l_uint8 **pdata, size_t *pnbytes)
l_int32 saConcatenatePdfToData (SARRAY *sa, l_uint8 **pdata, size_t *pnbytes)
l_int32 ptraConcatenatePdfToData (L_PTRA *pa_data, SARRAY *sa, l_uint8 **pdata, size_t *pnbytes)
void l_pdfSetG4ImageMask (l_int32 flag)
void l_pdfSetDateAndVersion (l_int32 flag)

Variables

static const l_int32 DEFAULT_INPUT_RES = 300
static l_int32 var_WRITE_G4_IMAGE_MASK = 1
static l_int32 var_WRITE_DATE_AND_VERSION = 1

Detailed Description

Read/write pdf format from/to file and memory.

  |=============================================================|
  |                         Important note                      |
  |=============================================================|
  | Some of these functions require libtiff, libjpeg, and libz  |
  | If you do not have these libraries, you must set            |
  |      #define  USE_PDFIO     0                               |
  | in environ.h.  This will link pdfiostub.c                   |
  |=============================================================|

   The first set of functions converts a set of images to a multi-page
   pdf file, with one image on each page.  All images are rendered
   at the same (input) resolution.  The images can be specified as
   being in a directory, or they can be in an sarray.  The output
   pdf can be either a file or an array of bytes in memory.

   The second set of functions implements a pdf output "device driver"
   for wrapping (encoding) any number of images on a single page
   in pdf.  The images can be rendered using a pdf viewer,
   such as gv, evince, xpdf or acroread.
   See: http://www.adobe.com/devnet/pdf/pdf_reference_archive.html

   The third set of functions (segmented) takes an image, an
   optional binary mask, an encoding flag, and some other parameters,
   and generates a single-page mixed raster pdf.

   The fourth set of functions (concatenated) takes a set of single-page
   pdf files and concatenates them into a multi-page pdf.

   1. Convert specified image files to Pdf (one image file per page)
        l_int32             convertFilesToPdf()
        l_int32             saConvertFilesToPdf()
        l_int32             saConvertFilesToPdfData()
        l_int32             selectDefaultPdfEncoding()

   2. Single page, multi-image converters
        l_int32             convertToPdf()
        l_int32             convertImageDataToPdf()
        l_int32             convertToPdfData()
        l_int32             convertImageDataToPdfData()
        l_int32             pixConvertToPdf()
        l_int32             pixConvertToPdfData()
        l_int32             pixWriteStreamPdf()

   3. Segmented multi-page, multi-image converter
        l_int32             convertSegmentedFilesToPdf()

   4. Segmented single page, multi-image converters
        l_int32             convertToPdfSegmented()
        l_int32             pixConvertToPdfSegmented()
        l_int32             convertToPdfDataSegmented()
        l_int32             pixConvertToPdfDataSegmented()

   Helper functions for generating the output pdf string
        static l_int32      l_generatePdf()
        static void         generateFixedStringsPdf()
        static void         generateMediaboxPdf()
        static l_int32      generatePageStringPdf()
        static l_int32      generateContentStringPdf()
        static l_int32      generatePreXStringsPdf()
        static l_int32      generateColormapStringsPdf()
        static void         generateTrailerPdf()
        static char        *makeTrailerStringPdf()
        static l_int32      generateOutputDataPdf()

   5. Multi-page concatenation
        l_int32             concatenatePdf()
        l_int32             saConcatenatePdf()
        l_int32             ptraConcatenatePdf()
        l_int32             concatenatePdfToData()
        l_int32             saConcatenatePdfToData()
        l_int32             ptraConcatenatePdfToData()

   Helper functions for generating the multi-page pdf output
        static l_int32      parseTrailerPdf()
        static char        *generatePagesObjStringPdf()
        static L_BYTEA     *substituteObjectNumbers()

   Create/destroy/access pdf data
        static L_PDF_DATA         *pdfdataCreate()
        static void                pdfdataDestroy()
        static L_COMPRESSED_DATA  *pdfdataGetCid()

   Set flags for special modes
        void                l_pdfSetG4ImageMask()
        void                l_pdfSetDateAndVersion()

   The top-level multi-image functions can be visualized as follows:
        Output pdf data to file:
           convertToPdf()  and  convertImageDataToPdf()
                   --> pixConvertToPdf()
                         --> pixConvertToPdfData()

        Output pdf data to array in memory:
           convertToPdfData()  and  convertImageDataToPdfData()
                   --> pixConvertToPdfData()

   The top-level segmented image functions can be visualized as follows:
        Output pdf data to file:
           convertToPdfSegmented()
                   --> pixConvertToPdfSegmented()
                         --> pixConvertToPdfDataSegmented()

        Output pdf data to array in memory:
           convertToPdfDataSegmented()
                   --> pixConvertToPdfDataSegmented()

   For multi-page concatenation, there are three different types of input
      (1) directory and optional filename filter
      (2) sarray of filenames
      (3) ptra of byte arrays of pdf data
   and two types of output for the concatenated pdf data
      (1) filename
      (2) data array and size
   High-level interfaces are given for each of the six combinations.

Definition in file pdfio.c.


Define Documentation

#define L_SMALLBUF   256

Definition at line 175 of file pdfio.c.

Referenced by generateFixedStringsPdf().

#define L_BIGBUF   2048
#define DEBUG_MULTIPAGE   0

Definition at line 180 of file pdfio.c.


Function Documentation

static l_int32 l_generatePdf ( l_uint8 **  pdata,
size_t *  pnbytes,
L_PDF_DATA *  lpd 
) [static]

l_generatePdf()

Input: &data (<return> pdf array) &nbytes (<return> number of bytes in pdf array) lpd (all the required input image data) Return: 0 if OK, 1 on error

Notes: (1) On error, no data is returned. (2) The objects are: 1: Catalog 2: Info 3: Pages 4: Page 5: Contents (rendering command) 6 to 6+n-1: n XObjects 6+n to 6+n+m-1: m colormaps

Definition at line 1541 of file pdfio.c.

References ERROR_INT, generateColormapStringsPdf(), generateContentStringPdf(), generateFixedStringsPdf(), generateMediaboxPdf(), generateOutputDataPdf(), generatePageStringPdf(), generatePreXStringsPdf(), generateTrailerPdf(), NULL, and PROCNAME.

Referenced by pixConvertToPdfData().

static void generateMediaboxPdf ( L_PDF_DATA *  lpd) [static]
static void generateTrailerPdf ( L_PDF_DATA *  lpd) [static]
static char * makeTrailerStringPdf ( NUMA *  naloc) [static]
static l_int32 generateOutputDataPdf ( l_uint8 **  pdata,
size_t *  pnbytes,
L_PDF_DATA *  lpd 
) [static]

generateOutputDataPdf()

Input: &data (<return> pdf data array) &nbytes (<return> size of pdf data array) lpd (input data used to make pdf) Return: 0 if OK, 1 on error

Notes: (1) Only called from l_generatePdf(). On error, no data is returned.

Definition at line 1988 of file pdfio.c.

References CALLOC, L_Compressed_Data::datacomp, ERROR_INT, FREE, L_Pdf_Data::id, L_NOCOPY, L_Pdf_Data::n, L_Compressed_Data::nbytescomp, L_Pdf_Data::ncmap, NULL, numaGetIArray(), L_Pdf_Data::obj1, L_Pdf_Data::obj2, L_Pdf_Data::obj3, L_Pdf_Data::obj4, L_Pdf_Data::obj5, L_Pdf_Data::objloc, L_Pdf_Data::objsize, pdfdataGetCid(), L_Pdf_Data::poststream, PROCNAME, L_Pdf_Data::sacmap, L_Pdf_Data::saprex, sarrayGetString(), sizes, L_Pdf_Data::trailer, and L_Pdf_Data::xrefloc.

Referenced by l_generatePdf().

static l_int32 parseTrailerPdf ( L_BYTEA *  bas,
NUMA **  pna 
) [static]
static char * generatePagesObjStringPdf ( NUMA *  napage) [static]
static L_BYTEA * substituteObjectNumbers ( L_BYTEA *  bas,
NUMA *  na_objs 
) [static]

substituteObjectNumbers()

Input: bas (lba of a pdf object) na_objs (object number mapping array) Return: bad (lba of rewritten pdf for the object)

Notes: (1) Interpret the first set of bytes as the object number, map to the new number, and write it out. (2) Find all occurrences of this 4-byte sequence: " 0 R" (3) Find the location and value of the integer preceding this, and map it to the new value. (4) Rewrite the object with new object numbers.

Definition at line 2607 of file pdfio.c.

References arrayFindEachSequence(), arrayFindSequence(), buf, FREE, l_byteaAppendData(), l_byteaAppendString(), l_byteaCreate(), l_byteaGetData(), NULL, numaDestroy(), numaGetCount(), numaGetIArray(), and size.

Referenced by ptraConcatenatePdfToData().

static L_COMPRESSED_DATA * pdfdataGetCid ( L_PDF_DATA *  lpd,
l_int32  index 
) [static]
l_int32 convertFilesToPdf ( const char *  dirname,
const char *  substr,
l_int32  res,
l_float32  scalefactor,
l_int32  quality,
const char *  title,
const char *  fileout 
)

convertFilesToPdf()

Input: directory name (containing images) substr (<optional> substring filter on filenames; can be NULL) res (input resolution of all images) scalefactor (scaling factor applied to each image) quality (used for JPEG only; 0 for default (75)) title (<optional> pdf title; if null, taken from the first image filename) fileout (pdf file of all images) Return: 0 if OK, 1 on error

Notes: (1) If substr is not NULL, only image filenames that contain the substring can be used. If substr == NULL, all files in the directory are used. (2) The files in the directory, after optional filtering by the substring, are lexically sorted in increasing order before concatenation. (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without colormap and many colors, or 32 bpp; FLATE for anything else.

Definition at line 211 of file pdfio.c.

References ERROR_INT, getSortedPathnamesInDirectory(), NULL, PROCNAME, saConvertFilesToPdf(), and sarrayDestroy().

Referenced by main().

l_int32 saConvertFilesToPdf ( SARRAY *  sa,
l_int32  res,
l_float32  scalefactor,
l_int32  quality,
const char *  title,
const char *  fileout 
)

saConvertFilesToPdf()

Input: sarray (of pathnames for images) res (input resolution of all images) scalefactor (scaling factor applied to each image) quality (used for JPEG only; 0 for default (75)) title (<optional> pdf title; if null, taken from the first image filename) fileout (pdf file of all images) Return: 0 if OK, 1 on error

Notes: (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without colormap and many colors, or 32 bpp; FLATE for anything else.

Definition at line 254 of file pdfio.c.

References ERROR_INT, FREE, l_binaryWrite(), L_ERROR, PROCNAME, and saConvertFilesToPdfData().

Referenced by convertFilesToPdf().

l_int32 saConvertFilesToPdfData ( SARRAY *  sa,
l_int32  res,
l_float32  scalefactor,
l_int32  quality,
const char *  title,
l_uint8 **  pdata,
size_t *  pnbytes 
)

saConvertFilesToPdfData()

Input: sarray (of pathnames for images) res (input resolution of all images) scalefactor (scaling factor applied to each image) quality (used for JPEG only; 0 for default (75)) title (<optional> pdf title; if null, taken from the first image filename) &data (<return> output pdf data (of all images)) &nbytes (<return> size of output pdf data) Return: 0 if OK, 1 on error

Notes: (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without colormap and many colors, or 32 bpp; FLATE for anything else.

Definition at line 303 of file pdfio.c.

References ERROR_INT, FALSE, FREE, l_byteaDestroy(), l_byteaInitFromMem(), L_ERROR, L_ERROR_STRING, L_NO_COMPACTION, L_NOCOPY, NULL, pixClone(), pixConvertToPdfData(), pixDestroy(), pixRead(), pixScale(), PROCNAME, ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), sarrayGetCount(), sarrayGetString(), and selectDefaultPdfEncoding().

Referenced by saConvertFilesToPdf().

l_int32 selectDefaultPdfEncoding ( PIX *  pix,
l_int32 *  ptype 
)

selectDefaultPdfEncoding()

Input: pix &type (<return> L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)

Notes: (1) This attempts to choose an encoding for the pix that results in the smallest file, assuming that if jpeg encoded, it will use quality = 75. The decision is approximate, in that (a) all colormapped images will be losslessly encoded with gzip (flate), and (b) an image with less than about 20 colors is likely to be smaller if flate encoded than if encoded as a jpeg (dct). For example, an image made by pixScaleToGray3() will have 10 colors, and flate encoding will give about twice the compression as jpeg with quality = 75.

Definition at line 395 of file pdfio.c.

References ERROR_INT, L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_MAX, pixGetColormap(), pixGetDimensions(), pixNumColors(), and PROCNAME.

Referenced by saConvertFilesToPdfData().

l_int32 convertToPdf ( const char *  filein,
l_int32  type,
l_int32  quality,
const char *  fileout,
l_int32  x,
l_int32  y,
l_int32  res,
L_PDF_DATA **  plpd,
l_int32  position,
const char *  title 
)

convertToPdf()

Input: filein (input image file -- any format) type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) quality (used for JPEG only; 0 for default (75)) fileout (output pdf file; only required on last image on page) x, y (location of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page) res (override the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input) &lpd (ptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed) position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE) title (<optional> pdf title; if null, taken from the first image placed on a page; e.g., an input image filename) Return: 0 if OK, 1 on error

Notes: (1) To wrap only one image in pdf, input plpd = NULL, and the value of position will be ignored: convertToPdf(... type, quality, x, y, res, NULL, 0); (2) To wrap multiple images on a single pdf page, this is called once for each successive image. Do it this way: L_PDF_DATA *lpd; convertToPdf(... type, quality, x, y, res, &lpd, L_FIRST_IMAGE); convertToPdf(... type, quality, x, y, res, &lpd, L_NEXT_IMAGE); ... convertToPdf(... type, quality, x, y, res, &lpd, L_LAST_IMAGE); This will write the result to the value of fileout specified in the first call; succeeding values of fileout are ignored. On the last call: the pdf data bytes are computed and written to fileout, lpd is destroyed internally, and the returned value of lpd is null. So the client has nothing to clean up. (3) (a) Set res == 0 to respect the resolution embedded in the image file. If no resolution is embedded, it will be set to the default value. (b) Set res to some other value to override the file resolution. (4) (a) If the input res and the resolution of the output device are equal, the image will be "displayed" at the same size as the original. (b) If the input res is 72, the output device will render the image at 1 pt/pixel. (c) Some possible choices for the default input pix resolution are: 72 ppi Render pix on any output device at one pt/pixel; 96 ppi Windows default for generated display images; 300 ppi Typical default for scanned images. We choose 300, which is sensible for rendering page images. However, images come from a variety of sources, and some are explicitly created for viewing on a display.

Definition at line 489 of file pdfio.c.

References convertToPdfData(), ERROR_INT, FREE, l_binaryWrite(), L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_LAST_IMAGE, and PROCNAME.

Referenced by main().

l_int32 convertImageDataToPdf ( l_uint8 *  imdata,
size_t  size,
l_int32  type,
l_int32  quality,
const char *  fileout,
l_int32  x,
l_int32  y,
l_int32  res,
L_PDF_DATA **  plpd,
l_int32  position,
const char *  title 
)

convertImageDataToPdf()

Input: imdata (array of formatted image data; e.g., png, jpeg) size (size of image data) type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) quality (used for JPEG only; 0 for default (75)) fileout (output pdf file; only required on last image on page) x, y (location of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page) res (override the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input) &lpd (ptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed) position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE) title (<optional> pdf title; taken from the first image placed on a page; e.g., an input image filename) Return: 0 if OK, 1 on error

Notes: (1) If res == 0 and the input resolution field is 0, this will use DEFAULT_INPUT_RES. (2) See comments in convertToPdf().

Definition at line 559 of file pdfio.c.

References ERROR_INT, L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_LAST_IMAGE, NULL, pixConvertToPdf(), pixDestroy(), pixReadMem(), and PROCNAME.

l_int32 convertToPdfData ( const char *  filein,
l_int32  type,
l_int32  quality,
l_uint8 **  pdata,
size_t *  pnbytes,
l_int32  x,
l_int32  y,
l_int32  res,
L_PDF_DATA **  plpd,
l_int32  position,
const char *  title 
)

convertToPdfData()

Input: filein (input image file -- any format) type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) quality (used for JPEG only; 0 for default (75)) &data (<return> pdf data in memory) &nbytes (<return> number of bytes in pdf data) x, y (location of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page) res (override the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input) &lpd (ptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed) position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE) title (<optional> pdf title; taken from the first image placed on a page; e.g., an input image filename) Return: 0 if OK, 1 on error

Notes: (1) If res == 0 and the input resolution field is 0, this will use DEFAULT_INPUT_RES. (2) See comments in convertToPdf().

Definition at line 623 of file pdfio.c.

References ERROR_INT, L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, NULL, pixConvertToPdfData(), pixDestroy(), pixRead(), and PROCNAME.

Referenced by convertToPdf().

l_int32 convertImageDataToPdfData ( l_uint8 *  imdata,
size_t  size,
l_int32  type,
l_int32  quality,
l_uint8 **  pdata,
size_t *  pnbytes,
l_int32  x,
l_int32  y,
l_int32  res,
L_PDF_DATA **  plpd,
l_int32  position,
const char *  title 
)

convertImageDataToPdfData()

Input: imdata (array of formatted image data; e.g., png, jpeg) size (size of image data) type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) quality (used for JPEG only; 0 for default (75)) &data (<return> pdf data in memory) &nbytes (<return> number of bytes in pdf data) x, y (location of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page) res (override the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input) &lpd (ptr to lpd, which is created on the first invocation and returned until last image is processed, at which time it is destroyed) position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE) title (<optional> pdf title; taken from the first image placed on a page; e.g., an input image filename) Return: 0 if OK, 1 on error

Notes: (1) If res == 0 and the input resolution field is 0, this will use DEFAULT_INPUT_RES. (2) See comments in convertToPdf().

Definition at line 690 of file pdfio.c.

References ERROR_INT, L_FIRST_IMAGE, NULL, pixConvertToPdfData(), pixDestroy(), pixReadMem(), and PROCNAME.

l_int32 pixConvertToPdf ( PIX *  pix,
l_int32  type,
l_int32  quality,
const char *  fileout,
l_int32  x,
l_int32  y,
l_int32  res,
L_PDF_DATA **  plpd,
l_int32  position,
const char *  title 
)

pixConvertToPdf()

Input: pix type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) quality (used for JPEG only; 0 for default (75)) fileout (output pdf file; only required on last image on page) x, y (location of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page) res (override the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input) &lpd (ptr to lpd, which is created on the first invocation and returned until last image is processed) position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE) title (<optional> pdf title; taken from the first image placed on a page; e.g., an input image filename) Return: 0 if OK, 1 on error

Notes: (1) If res == 0 and the input resolution field is 0, this will use DEFAULT_INPUT_RES. (2) This only writes data to fileout if it is the last image to be written on the page. (3) See comments in convertToPdf().

Definition at line 758 of file pdfio.c.

References ERROR_INT, FREE, l_binaryWrite(), L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_LAST_IMAGE, pixConvertToPdfData(), and PROCNAME.

Referenced by convertImageDataToPdf(), and main().

l_int32 pixConvertToPdfData ( PIX *  pix,
l_int32  type,
l_int32  quality,
l_uint8 **  pdata,
size_t *  pnbytes,
l_int32  x,
l_int32  y,
l_int32  res,
L_PDF_DATA **  plpd,
l_int32  position,
const char *  title 
)

pixConvertToPdfData()

Input: pix (all depths; cmap OK) type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE) quality (used for JPEG only; 0 for default (75)) &data (<return> pdf array) &nbytes (<return> number of bytes in pdf array) x, y (location of lower-left corner of image, in pixels, relative to the PostScript origin (0,0) at the lower-left corner of the page) res (override the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input) &lpd (ptr to lpd, which is created on the first invocation and returned until last image is processed) position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, L_LAST_IMAGE) title (<optional> pdf title; taken from the first image placed on a page; e.g., an input image filename) Return: 0 if OK, 1 on error

Notes: (1) If res == 0 and the input resolution field is 0, this will use DEFAULT_INPUT_RES. (2) This only writes data if it is the last image to be written on the page. (3) See comments in convertToPdf().

Definition at line 828 of file pdfio.c.

References L_Pdf_Data::cida, DEFAULT_INPUT_RES, ERROR_INT, L_Compressed_Data::h, L_FIRST_IMAGE, L_FLATE_ENCODE, L_G4_ENCODE, l_generatePdf(), L_JPEG_ENCODE, L_LAST_IMAGE, L_WARNING, L_Pdf_Data::n, NULL, pdfdataCreate(), pdfdataDestroy(), pixGenerateFlateData(), pixGenerateG4Data(), pixGenerateJpegData(), pixGetColormap(), pixGetDepth(), PROCNAME, ptaAddPt(), ptraAdd(), L_Compressed_Data::res, L_Compressed_Data::w, L_Pdf_Data::wh, and L_Pdf_Data::xy.

Referenced by convertImageDataToPdfData(), convertToPdfData(), pixConvertToPdf(), pixConvertToPdfDataSegmented(), pixWriteStreamPdf(), and saConvertFilesToPdfData().

l_int32 pixWriteStreamPdf ( FILE *  fp,
PIX *  pix,
l_int32  res,
const char *  title 
)

pixWriteStreamPdf()

Input: fp (stream opened for writing) pix (all depths, cmap OK) res (override the resolution of the input image, in ppi; use 0 to respect the resolution embedded in the input) title (<optional> pdf title; taken from the first image placed on a page; e.g., an input image filename) Return: 0 if OK, 1 on error

Notes: (1) This is the simplest interface for writing a single image with pdf encoding. It uses G4 encoding for 1 bpp, JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE encoding for everything else.

Definition at line 966 of file pdfio.c.

References ERROR_INT, FREE, L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, NULL, pixConvertToPdfData(), pixGetColormap(), pixGetDepth(), and PROCNAME.

Referenced by pixWriteStream().

l_int32 convertSegmentedFilesToPdf ( const char *  dirname,
const char *  substr,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXAA *  baa,
l_int32  quality,
l_float32  scalefactor,
const char *  title,
const char *  fileout 
)

convertSegmentedFilesToPdf()

Input: directory name (containing images) substr (<optional> substring filter on filenames; can be NULL) res (input resolution of all images) type (compression type for non-image regions; the image regions are always compressed with L_JPEG_ENCODE) thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) boxaa (of image regions) quality (used for JPEG only; 0 for default (75)) scalefactor (scaling factor applied to each image region) title (<optional> pdf title; if null, taken from the first image filename) fileout (pdf file of all images) Return: 0 if OK, 1 on error

Notes: (1) If substr is not NULL, only image filenames that contain the substring can be used. If substr == NULL, all files in the directory are used. (2) The files in the directory, after optional filtering by the substring, are lexically sorted in increasing order before concatenation. (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without colormap and many colors, or 32 bpp; FLATE for anything else. (4) The boxaa contains one boxa of "image regions" for each image file. The boxa must all exist, but they can be empty. They must be aligned with the sorted set of images. (5) The scalefactor is applied to each image region. It is typically < 1.0, to save bytes in the final pdf, because the resolution is often not critical in non-text regions. (6) The non-image regions are automatically scaled up by 2x and thresholded if the encoding type is G4. If the non-image regions are not encoded with G4, no scaling is performed on them.

Definition at line 1043 of file pdfio.c.

References PartitionElement::boxa, boxaaGetBoxa(), boxaaGetCount(), boxaDestroy(), boxaGetCount(), convertToPdfDataSegmented(), ERROR_INT, FALSE, FREE, getSortedPathnamesInDirectory(), l_binaryWrite(), l_byteaDestroy(), l_byteaInitFromMem(), L_CLONE, L_ERROR, L_ERROR_STRING, L_NO_COMPACTION, L_NOCOPY, NULL, PROCNAME, ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), sarrayDestroy(), sarrayGetCount(), and sarrayGetString().

Referenced by main().

l_int32 convertToPdfSegmented ( const char *  filein,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
const char *  fileout 
)

convertToPdfSegmented()

Input: filein (input image file -- any format) res (input image resolution; typ. 300 ppi; use 0 for default) type (compression type for non-image regions; the image regions are always compressed with L_JPEG_ENCODE) thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) boxa (of image regions; can be null) quality (used for jpeg image regions; 0 for default) scalefactor (used for jpeg regions; must be <= 1.0) fileout (output pdf file) Return: 0 if OK, 1 on error

Notes: (1) If there are no image regions, set boxa == NULL; quality and scalefactor are ignored. (2) Typically, scalefactor is < 1.0, because the image regions can be rendered at a lower resolution (for better compression) than the text regions. If scalefactor == 0, we use 1.0. If the input image is 1 bpp and scalefactor < 1.0, we use scaleToGray() to downsample the image regions to gray before compressing them. (3) If the compression type for non-image regions is L_G4_ENCODE and bpp > 1, the image is upscaled 2x and thresholded to 1 bpp. That is the only situation where thresh is used. (4) The parameter quality is only used for image regions. If type == L_JPEG_ENCODE, default jpeg quality (75) is used for the non-image regions. (5) Processing matrix for non-image regions.

Input          G4              JPEG                FLATE
----------|---------------------------------------------------
1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
          |
cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
          |
2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
no cmap   |                  2,4 bpp
          |
8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
no cmap   |                  8,32 bpp

Summary: (a) if G4 is requested, G4 is used, with 2x upscaling for all cases except 1 bpp. (b) if JPEG is requested, use flate encoding for all cases except 8 bpp without cmap and 32 bpp (rgb). (c) if FLATE is requested, use flate with no transformation of the raster data.

(6) Calling options/sequence for these functions:
file --> file (convertToPdfSegmented)
    pix --> file (pixConvertToPdfSegmented)
        pix --> data (pixConvertToPdfDataSegmented)
file --> data (convertToPdfDataSegmented)
    pix --> data (pixConvertToPdfDataSegmented)

Definition at line 1195 of file pdfio.c.

References ERROR_INT, L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_WARNING, NULL, pixConvertToPdfSegmented(), pixDestroy(), pixRead(), and PROCNAME.

Referenced by main().

l_int32 pixConvertToPdfSegmented ( PIX *  pixs,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
const char *  fileout,
const char *  title 
)

pixConvertToPdfSegmented()

Input: pixs (any depth, cmap OK) res (input image resolution; typ. 300 ppi; use 0 for default) type (compression type for non-image regions; the image regions are always compressed with L_JPEG_ENCODE) thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) boxa (of image regions; can be null) quality (used for jpeg image regions; 0 for default) scalefactor (used for jpeg regions; must be <= 1.0) fileout (output pdf file) title (<optional> pdf title; typically taken from the input file for the pix) Return: 0 if OK, 1 on error

Notes: (1) See convertToPdfSegmented() for details.

Definition at line 1251 of file pdfio.c.

References ERROR_INT, FREE, l_binaryWrite(), L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_WARNING, pixConvertToPdfDataSegmented(), and PROCNAME.

Referenced by convertToPdfSegmented().

l_int32 convertToPdfDataSegmented ( const char *  filein,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
l_uint8 **  pdata,
size_t *  pnbytes 
)

convertToPdfDataSegmented()

Input: filein (input image file -- any format) res (input image resolution; typ. 300 ppi; use 0 for default) type (compression type for non-image regions; the image regions are always compressed with L_JPEG_ENCODE) thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) boxa (of image regions; can be null) quality (used for jpeg image regions; 0 for default) scalefactor (used for jpeg regions; must be <= 1.0) &data (<return> pdf data in memory) &nbytes (<return> number of bytes in pdf data) Return: 0 if OK, 1 on error

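Notes: (1) If there are no image regions, set boxa == NULL; quality and scalefactor are ignored. (2) Typically, scalefactor is < 1.0. The image regions are typically rendered at a lower resolution (for better compression) than the text regions; see convertToPdfSegmented() for details.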

Definition at line 1311 of file pdfio.c.

References ERROR_INT, L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_WARNING, NULL, pixConvertToPdfDataSegmented(), pixDestroy(), pixRead(), and PROCNAME.

Referenced by convertSegmentedFilesToPdf(), and main().

l_int32 pixConvertToPdfDataSegmented ( PIX *  pixs,
l_int32  res,
l_int32  type,
l_int32  thresh,
BOXA *  boxa,
l_int32  quality,
l_float32  scalefactor,
l_uint8 **  pdata,
size_t *  pnbytes,
const char *  title 
)

pixConvertToPdfDataSegmented()

Input: pixs (any depth, cmap OK) res (input image resolution; typ. 300 ppi; use 0 for default) type (compression type for non-image regions; the image regions are always compressed with L_JPEG_ENCODE) thresh (used for converting gray --> 1 bpp with L_G4_ENCODE) boxa (of image regions; can be null) quality (used for jpeg image regions; 0 for default) scalefactor (used for jpeg regions; must be <= 1.0) &data (<return> pdf data in memory) &nbytes (<return> number of bytes in pdf data) title (<optional> pdf title; typically taken from the input file for the pix) Return: 0 if OK, 1 on error

Notes: (1) See convertToPdfSegmented() for details.

Definition at line 1374 of file pdfio.c.

References PartitionElement::box, boxaGetBox(), boxaGetCount(), boxDestroy(), boxGetGeometry(), boxTransform(), DEFAULT_INPUT_RES, ERROR_INT, FALSE, L_CLONE, L_FIRST_IMAGE, L_FLATE_ENCODE, L_G4_ENCODE, L_JPEG_ENCODE, L_LAST_IMAGE, L_NEXT_IMAGE, L_SET_WHITE, L_WARNING, NULL, PIX_SRC, pixClipRectangle(), pixClone(), pixConvertTo8(), pixConvertToPdfData(), pixCreateTemplate(), pixDestroy(), pixGetColormap(), pixGetDepth(), pixRasterop(), pixRemoveColormap(), pixScale(), pixScaleGray2xLIThresh(), pixScaleToGray(), pixSetBlackOrWhite(), pixSetBlackOrWhiteBoxa(), PROCNAME, REMOVE_CMAP_BASED_ON_SRC, and REMOVE_CMAP_TO_GRAYSCALE.

Referenced by convertToPdfDataSegmented(), and pixConvertToPdfSegmented().

l_int32 concatenatePdf ( const char *  dirname,
const char *  substr,
const char *  fileout 
)

concatenatePdf()

Input: directory name (containing single-page pdf files) substr (<optional> substring filter on filenames; can be NULL) fileout (concatenated pdf file) Return: 0 if OK, 1 on error

Notes: (1) This only works with leptonica-formatted single-page pdf files. (2) If substr is not NULL, only filenames that contain the substring can be returned. If substr == NULL, none of the filenames are filtered out. (3) The files in the directory, after optional filtering by the substring, are lexically sorted in increasing order before concatenation.

Definition at line 2071 of file pdfio.c.

References ERROR_INT, getSortedPathnamesInDirectory(), NULL, PROCNAME, saConcatenatePdf(), and sarrayDestroy().

Referenced by main().

l_int32 saConcatenatePdf ( SARRAY *  sa,
const char *  fileout 
)

saConcatenatePdf()

Input: sarray (of pathnames for single-page pdf files) fileout (concatenated pdf file) Return: 0 if OK, 1 on error

Notes: (1) This only works with leptonica-formatted single-page pdf files.

Definition at line 2104 of file pdfio.c.

References ERROR_INT, FREE, l_binaryWrite(), PROCNAME, and saConcatenatePdfToData().

Referenced by concatenatePdf().

l_int32 ptraConcatenatePdf ( L_PTRA *  pa,
const char *  fileout 
)

ptraConcatenatePdf()

Input: ptra (array of pdf strings, each for a single-page pdf file) fileout (concatenated pdf file) Return: 0 if OK, 1 on error

Notes: (1) This only works with leptonica-formatted single-page pdf files.

Definition at line 2138 of file pdfio.c.

References ERROR_INT, FREE, l_binaryWrite(), NULL, PROCNAME, and ptraConcatenatePdfToData().

l_int32 concatenatePdfToData ( const char *  dirname,
const char *  substr,
l_uint8 **  pdata,
size_t *  pnbytes 
)

concatenatePdfToData()

Input: directory name (containing single-page pdf files) substr (<optional> substring filter on filenames; can be NULL) &data (<return> concatenated pdf data in memory) &nbytes (<return> number of bytes in pdf data) Return: 0 if OK, 1 on error

Notes: (1) This only works with leptonica-formatted single-page pdf files. (2) If substr is not NULL, only filenames that contain the substring can be returned. If substr == NULL, none of the filenames are filtered out. (3) The files in the directory, after optional filtering by the substring, are lexically sorted in increasing order before concatenation.

Definition at line 2180 of file pdfio.c.

References ERROR_INT, getSortedPathnamesInDirectory(), NULL, PROCNAME, saConcatenatePdfToData(), and sarrayDestroy().

l_int32 saConcatenatePdfToData ( SARRAY sa,
l_uint8 **  pdata,
size_t *  pnbytes 
)

saConcatenatePdfToData()

Input: sarray (of pathnames for single-page pdf files) &data (<return> concatenated pdf data in memory) &nbytes (<return> number of bytes in pdf data) Return: 0 if OK, 1 on error

Notes: (1) This only works with leptonica-formatted single-page pdf files.

Definition at line 2219 of file pdfio.c.

References ERROR_INT, FALSE, l_byteaDestroy(), l_byteaInitFromFile(), L_NO_COMPACTION, L_NOCOPY, NULL, PROCNAME, ptraAdd(), ptraConcatenatePdfToData(), ptraCreate(), ptraDestroy(), ptraGetActualCount(), ptraRemove(), sarrayGetCount(), and sarrayGetString().

Referenced by concatenatePdfToData(), and saConcatenatePdf().

l_int32 ptraConcatenatePdfToData ( L_PTRA *  pa_data,
SARRAY *  sa,
l_uint8 **  pdata,
size_t *  pnbytes 
)

ptraConcatenatePdfToData()

Input: ptra (array of pdf strings, each for a single-page pdf file) sarray (<optional> of pathnames for input pdf files) &data (<return> concatenated pdf data in memory) &nbytes (<return> number of bytes in pdf data) Return: 0 if OK, 1 on error

Notes: (1) This only works with leptonica-formatted single-page pdf files. pdf files generated by other programs will have unpredictable (and usually bad) results. The requirements for each pdf file: (a) The Catalog and Info objects are the first two. (b) Object 3 is Pages (c) Object 4 is Page (d) The remaining objects are Contents, XObjects, and ColorSpace (2) We remove trailers from each page, and append the full trailer for all pages at the end. (3) For all but the first file, remove the ID and the first 3 objects (catalog, info, pages), so that each subsequent file has only objects of these classes: Page, Contents, XObject, ColorSpace (Indexed RGB). For those objects, we substitute these refs to objects in the local file: Page: Parent(object 3), Contents, XObject(typically multiple) XObject: [ColorSpace if indexed] The Pages object on the first page (object 3) has a Kids array of references to all the Page objects, with a Count equal to the number of pages. Each Page object refers back to this parent.

Definition at line 2295 of file pdfio.c.

References ERROR_INT, FREE, generatePagesObjStringPdf(), l_byteaAppendData(), l_byteaAppendString(), l_byteaCopyData(), l_byteaCreate(), l_byteaDestroy(), l_byteaGetData(), l_byteaGetSize(), l_byteaInitFromMem(), L_CLONE, L_ERROR_INT, L_ERROR_STRING, L_INSERT, L_NO_COMPACTION, L_NOCOPY, makeTrailerStringPdf(), NULL, numaaAddNuma(), numaaCreate(), numaAddNumber(), numaaDestroy(), numaaGetNuma(), numaaWriteStream(), numaCreate(), numaDestroy(), numaGetCount(), numaGetIArray(), numaMakeConstant(), numaMakeDelta(), numaMakeSequence(), numaReplaceNumber(), numaSetValue(), numaWriteStream(), parseTrailerPdf(), PROCNAME, ptraCompactArray(), ptraGetActualCount(), ptraGetHandle(), ptraRemove(), sarrayGetString(), size, sizes, and substituteObjectNumbers().

Referenced by convertSegmentedFilesToPdf(), ptraConcatenatePdf(), saConcatenatePdfToData(), and saConvertFilesToPdfData().

void l_pdfSetG4ImageMask ( l_int32  flag)

l_pdfSetG4ImageMask()

Input: flag (1 for writing g4 data as fg only through a mask; 0 for writing fg and bg) Return: void

Notes: (1) The default is for writing only the fg (through the mask). That way when you write a 1 bpp image, the bg is transparent, so any previously written image remains visible behind it.

Definition at line 2761 of file pdfio.c.

References var_WRITE_G4_IMAGE_MASK.

Referenced by main().

void l_pdfSetDateAndVersion ( l_int32  flag)

l_pdfSetDateAndVersion()

Input: flag (1 for writing date/time and leptonica version; 0 for omitting this from the metadata) Return: void

Notes: (1) The default is for writing this data. For regression tests that compare output against golden files, it is useful to omit.

Definition at line 2779 of file pdfio.c.

References var_WRITE_DATE_AND_VERSION.

Referenced by main().


Variable Documentation

const l_int32 DEFAULT_INPUT_RES = 300 [static]

Definition at line 144 of file pdfio.c.

Referenced by pixConvertToPdfData(), and pixConvertToPdfDataSegmented().

l_int32 var_WRITE_G4_IMAGE_MASK = 1 [static]

Definition at line 171 of file pdfio.c.

Referenced by generatePreXStringsPdf(), and l_pdfSetG4ImageMask().

l_int32 var_WRITE_DATE_AND_VERSION = 1 [static]

Definition at line 173 of file pdfio.c.

Referenced by generateFixedStringsPdf(), and l_pdfSetDateAndVersion().

 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines