Unsupervised classification of connected components and words; image comparison using word bounding boxes. More...

#include <string.h>
#include "allheaders.h"

Defines
#define	L_BUF_SIZE 512
Functions
static l_int32	testLineAlignmentX (NUMA *na1, NUMA *na2, l_int32 shiftx, l_int32 delx, l_int32 nperline)
static l_int32	countAlignedMatches (NUMA *nai1, NUMA *nai2, NUMA *nasx, NUMA *nasy, l_int32 n1, l_int32 n2, l_int32 delx, l_int32 dely, l_int32 nreq, l_int32 *psame, l_int32 debugflag)
static void	printRowIndices (l_int32 *index1, l_int32 n1, l_int32 *index2, l_int32 n2)
l_int32	jbCorrelation (const char *dirin, l_float32 thresh, l_float32 weight, l_int32 components, const char *rootname, l_int32 firstpage, l_int32 npages, l_int32 renderflag)
l_int32	jbRankHaus (const char *dirin, l_int32 size, l_float32 rank, l_int32 components, const char *rootname, l_int32 firstpage, l_int32 npages, l_int32 renderflag)
JBCLASSER *	jbWordsInTextlines (const char *dirin, l_int32 reduction, l_int32 maxwidth, l_int32 maxheight, l_float32 thresh, l_float32 weight, NUMA **pnatl, l_int32 firstpage, l_int32 npages)
l_int32	pixGetWordsInTextlines (PIX *pixs, l_int32 reduction, l_int32 minwidth, l_int32 minheight, l_int32 maxwidth, l_int32 maxheight, BOXA **pboxad, PIXA **ppixad, NUMA **pnai)
l_int32	pixGetWordBoxesInTextlines (PIX *pixs, l_int32 reduction, l_int32 minwidth, l_int32 minheight, l_int32 maxwidth, l_int32 maxheight, BOXA **pboxad, NUMA **pnai)
NUMAA *	boxaExtractSortedPattern (BOXA *boxa, NUMA *na)
l_int32	numaaCompareImagesByBoxes (NUMAA *naa1, NUMAA *naa2, l_int32 nperline, l_int32 nreq, l_int32 maxshiftx, l_int32 maxshifty, l_int32 delx, l_int32 dely, l_int32 *psame, l_int32 debugflag)
Variables
static const l_int32	JB_WORDS_MIN_WIDTH = 5
static const l_int32	JB_WORDS_MIN_HEIGHT = 3

Detailed Description

Unsupervised classification of connected components and words; image comparison using word bounding boxes.

    Top-level jb2 correlation and rank-hausdorff

       l_int32         jbCorrelation()
       l_int32         jbRankHaus()

    Extract and classify words in textline order

       JBCLASSER      *jbWordsInTextlines()
       l_int32         pixGetWordsInTextlines()
       l_int32         pixGetWordBoxesInTextlines()

    Use word bounding boxes to compare page images

       NUMAA          *boxaExtractSortedPattern()
       l_int32         numaaCompareImagesByBoxes()
       static l_int32  testLineAlignmentX()
       static l_int32  countAlignedMatches()
       static void     printRowIndices()

Definition in file classapp.c.

Define Documentation

#define L_BUF_SIZE 512

Definition at line 47 of file classapp.c.

Referenced by jbCorrelation() and jbRankHaus().

Function Documentation

static l_int32 testLineAlignmentX	(	NUMA *	na1,
		NUMA *	na2,
		l_int32	shiftx,
		l_int32	delx,
		l_int32	nperline
	)		`[static]`

Definition at line 775 of file classapp.c.

References ERROR_INT, L_ABS, numaGetIValue(), and PROCNAME.

Referenced by numaaCompareImagesByBoxes().

static l_int32 countAlignedMatches	(	NUMA *	nai1,
		NUMA *	nai2,
		NUMA *	nasx,
		NUMA *	nasy,
		l_int32	n1,
		l_int32	n2,
		l_int32	delx,
		l_int32	dely,
		l_int32	nreq,
		l_int32 *	psame,
		l_int32	debugflag
	)		`[static]`

Definition at line 823 of file classapp.c.

References CALLOC, ERROR_INT, FREE, L_ABS, numaGetCount(), numaGetIArray(), printRowIndices(), and PROCNAME.

Referenced by numaaCompareImagesByBoxes().

static void printRowIndices	(	l_int32 *	index1,
		l_int32	n1,
		l_int32 *	index2,
		l_int32	n2
	)		`[static]`

Definition at line 910 of file classapp.c.

Referenced by countAlignedMatches().

l_int32 jbCorrelation	(	const char *	dirin,
		l_float32	thresh,
		l_float32	weight,
		l_int32	components,
		const char *	rootname,
		l_int32	firstpage,
		l_int32	npages,
		l_int32	renderflag
	)

jbCorrelation()

Input: dirin (directory of input images) thresh (typically ~0.8) weight (typically ~0.6) components (JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS) rootname (for output files) firstpage (0-based) npages (use 0 for all pages in dirin) renderflag (1 to render from templates; 0 to skip) Return: 0 if OK, 1 on error

Notes: (1) The images must be 1 bpp. If they are not, you can convert them using convertFilesTo1bpp(). (2) See prog/jbcorrelation for generating more output (e.g., for debugging)

Definition at line 84 of file classapp.c.

References ERROR_INT, FALSE, filename, getSortedPathnamesInDirectory(), IFF_PNG, JB_CHARACTERS, JB_CONN_COMPS, JB_WORDS, jbAddPages(), jbClasserDestroy(), jbCorrelationInit(), jbDataDestroy(), jbDataRender(), jbDataSave(), jbDataWrite(), L_BUF_SIZE, L_CLONE, nfiles, NULL, CCBorda::pix, pixaDestroy(), pixaGetCount(), pixaGetPix(), pixDestroy(), pixWrite(), PROCNAME, sarrayDestroy(), and sarrayGetCount().

Referenced by main().

l_int32 jbRankHaus	(	const char *	dirin,
		l_int32	size,
		l_float32	rank,
		l_int32	components,
		const char *	rootname,
		l_int32	firstpage,
		l_int32	npages,
		l_int32	renderflag
	)

jbRankHaus()

Input: dirin (directory of input images) size (of Sel used for dilation; typ. 2) rank (rank value of match; typ. 0.97) components (JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS) rootname (for output files) firstpage (0-based) npages (use 0 for all pages in dirin) renderflag (1 to render from templates; 0 to skip) Return: 0 if OK, 1 on error

Notes: (1) See prog/jbrankhaus for generating more output (e.g., for debugging)

Definition at line 164 of file classapp.c.

References ERROR_INT, FALSE, filename, getSortedPathnamesInDirectory(), IFF_PNG, JB_CHARACTERS, JB_CONN_COMPS, JB_WORDS, jbAddPages(), jbClasserDestroy(), jbDataDestroy(), jbDataRender(), jbDataSave(), jbDataWrite(), jbRankHausInit(), L_BUF_SIZE, L_CLONE, nfiles, NULL, CCBorda::pix, pixaDestroy(), pixaGetCount(), pixaGetPix(), pixDestroy(), pixWrite(), PROCNAME, sarrayDestroy(), and sarrayGetCount().

Referenced by main().

JBCLASSER* jbWordsInTextlines	(	const char *	dirin,
		l_int32	reduction,
		l_int32	maxwidth,
		l_int32	maxheight,
		l_float32	thresh,
		l_float32	weight,
		NUMA **	pnatl,
		l_int32	firstpage,
		l_int32	npages
	)

jbWordsInTextlines()

Input: dirin (directory of input pages) reduction (1 for full res; 2 for half-res) maxwidth (of word mask components, to be kept) maxheight (of word mask components, to be kept) thresh (on correlation; 0.80 is reasonable) weight (for handling thick text; 0.6 is reasonable) &natl (<return> numa with textline index for each component) firstpage (0-based) npages (use 0 for all pages in dirin) Return: classer (for the set of pages)

Notes: (1) This is a high-level function. See prog/jbwords for example of usage. (2) Typically, words can be found reasonably well at a resolution of about 150 ppi. For highest accuracy, you should use 300 ppi. Assuming that the input images are 300 ppi, use reduction = 1 for finding words at full res, and reduction = 2 for finding them at 150 ppi.

Definition at line 254 of file classapp.c.

References boxaDestroy(), ERROR_PTR, getSortedPathnamesInDirectory(), JbClasser::h, CCBorda::h, JB_WORDS, JB_WORDS_MIN_HEIGHT, JB_WORDS_MIN_WIDTH, jbAddPageComponents(), jbCorrelationInit(), L_WARNING_INT, nfiles, NULL, numaCreate(), numaDestroy(), numaJoin(), CCBorda::pix, pixaDestroy(), pixDestroy(), pixGetDimensions(), pixGetWordsInTextlines(), pixRead(), PROCNAME, JbClasser::safiles, sarrayCopy(), sarrayDestroy(), sarrayGetCount(), sarrayGetString(), JbClasser::w, and CCBorda::w.

Referenced by main().

l_int32 pixGetWordsInTextlines	(	PIX *	pixs,
		l_int32	reduction,
		l_int32	minwidth,
		l_int32	minheight,
		l_int32	maxwidth,
		l_int32	maxheight,
		BOXA **	pboxad,
		PIXA **	ppixad,
		NUMA **	pnai
	)

pixGetWordsInTextlines()

Input: pixs (1 bpp, 300 ppi) reduction (1 for full res; 2 for half-res) minwidth, minheight (of saved components; smaller are discarded) maxwidth, maxheight (of saved components; larger are discarded) &boxad (<return> word boxes sorted in textline line order) &pixad (<return> word images sorted in textline line order) &naindex (<return> index of textline for each word) Return: 0 if OK, 1 on error

Notes: (1) The input should be at a resolution of about 300 ppi. The word masks can be computed at either 150 ppi or 300 ppi. For the former, set reduction = 2. (2) The four size constraints on saved components are all used at 2x reduction. (3) The results are word images (and their b.b.), extracted in textline order, all at 2x reduction, and with a numa giving the textline index for each word. (4) The pixa and boxa interfaces should make this type of application simple to put together. The steps are:

generate first estimate of word masks
get b.b. of these, and remove the small and big ones
extract pixa of the word mask from these boxes
extract pixa of the actual word images, using word masks
sort actual word images in textline order (2d)
flatten them to a pixa (1d), saving the textline index for each pix

(5) In an actual application, it may be desirable to pre-filter the input image to remove large components, to extract single columns of text, and to deskew them. For example, to remove both large components and small noisy components that can interfere with the statistics used to estimate parameters for segmenting by words, but still retain text lines, the following image preprocessing can be done:

    Pix *pixt = pixMorphSequence(pixs, "c40.1", 0);
    Pix *pixf = pixSelectBySize(pixt, 0, 60, 8, L_SELECT_HEIGHT, L_SELECT_IF_LT, NULL);
    pixAnd(pixf, pixf, pixs);  // the filtered image

The closing turns text lines into long blobs, but does not significantly increase their height. But if there are many small connected components in a dense texture, this is likely to generate tall components that will be eliminated in pixf.

Definition at line 369 of file classapp.c.

References boxaaDestroy(), boxaDestroy(), boxaSelectBySize(), boxaSort2d(), ERROR_INT, L_CLONE, L_COPY, L_SELECT_IF_BOTH, L_SELECT_IF_GTE, L_SELECT_IF_LTE, maxsize, NULL, numaaDestroy(), pixaaDestroy(), pixaaFlattenToPixa(), pixaClipToPix(), pixaCreateFromBoxa(), pixaDestroy(), pixaGetBoxa(), pixaSort2dByIndex(), pixClone(), pixConnComp(), pixDestroy(), pixReduceRankBinaryCascade(), pixWordMaskByDilation(), and PROCNAME.

Referenced by jbWordsInTextlines().

l_int32 pixGetWordBoxesInTextlines	(	PIX *	pixs,
		l_int32	reduction,
		l_int32	minwidth,
		l_int32	minheight,
		l_int32	maxwidth,
		l_int32	maxheight,
		BOXA **	pboxad,
		NUMA **	pnai
	)

pixGetWordBoxesInTextlines()

Input: pixs (1 bpp, 300 ppi) reduction (1 for full res; 2 for half-res) minwidth, minheight (of saved components; smaller are discarded) maxwidth, maxheight (of saved components; larger are discarded) &boxad (<return> word boxes sorted in textline line order) &naindex (<return> index of textline for each word) Return: 0 if OK, 1 on error

Notes: (1) The input should be at a resolution of about 300 ppi. The word masks can be computed at either 150 ppi or 300 ppi. For the former, set reduction = 2. (2) In an actual application, it may be desirable to pre-filter the input image to remove large components, to extract single columns of text, and to deskew them. (3) This is a special version that just finds the word boxes in line order, with a numa giving the textline index for each word. See pixGetWordsInTextlines() for more details.

Definition at line 477 of file classapp.c.

References boxaaDestroy(), boxaaFlattenToBoxa(), boxaDestroy(), boxaSelectBySize(), boxaSort2d(), ERROR_INT, L_CLONE, L_SELECT_IF_BOTH, L_SELECT_IF_GTE, L_SELECT_IF_LTE, maxsize, NULL, pixClone(), pixConnComp(), pixDestroy(), pixReduceRankBinaryCascade(), pixWordMaskByDilation(), and PROCNAME.

Referenced by main().

NUMAA* boxaExtractSortedPattern	(	BOXA *	boxa,
		NUMA *	na
	)

boxaExtractSortedPattern()

Input: boxa (typ. of word bounding boxes, in textline order) numa (index of textline for each box in boxa) Return: naa (numaa, where each numa represents one textline), or null on error

Notes: (1) The input is expected to come from pixGetWordBoxesInTextlines(). (2) Each numa in the output consists of an average y coordinate of the first box in the textline, followed by pairs of x coordinates representing the left and right edges of each of the boxes in the textline.

Definition at line 563 of file classapp.c.

References boxaGetBox(), boxaGetCount(), boxDestroy(), boxGetGeometry(), ERROR_PTR, CCBorda::h, L_CLONE, L_INSERT, NULL, numaaAddNuma(), numaaCreate(), numaAddNumber(), numaCreate(), numaGetIValue(), PROCNAME, and CCBorda::w.

Referenced by main().

l_int32 numaaCompareImagesByBoxes	(	NUMAA *	naa1,
		NUMAA *	naa2,
		l_int32	nperline,
		l_int32	nreq,
		l_int32	maxshiftx,
		l_int32	maxshifty,
		l_int32	delx,
		l_int32	dely,
		l_int32 *	psame,
		l_int32	debugflag
	)

numaaCompareImagesByBoxes()

Input: naa1 (for image 1, formatted by boxaExtractSortedPattern()) naa2 (ditto; for image 2) nperline (number of box regions to be used in each textline) nreq (number of complete row matches required) maxshiftx (max allowed x shift between two patterns, in pixels) maxshifty (max allowed y shift between two patterns, in pixels) delx (max allowed difference in x data, after alignment) dely (max allowed difference in y data, after alignment) &same (<return> 1 if row matches are found; 0 otherwise) debugflag (1 for debug output) Return: 0 if OK, 1 on error

Notes: (1) Each input numaa describes a set of sorted bounding boxes (sorted by textline and, within each textline, from left to right) in the images from which they are derived. See boxaExtractSortedPattern() for a description of the data format in each of the input numaa. (2) This function does an alignment between the input descriptions of bounding boxes for two images. The input parameter nperline specifies the number of boxes to consider in each line when testing for a match, and nreq is the required number of lines that must be well-aligned to get a match. (3) Testing by alignment has 3 steps: (a) Generating the location of word bounding boxes from the images (prior to calling this function). (b) Listing all possible pairs of aligned rows, based on tolerances in horizontal and vertical positions of the boxes. Specifically, all pairs of rows are enumerated whose first nperline boxes can be brought into close alignment, based on the delx parameter for boxes in the line and within the overall maxshiftx and maxshifty constraints. (c) Each pair, starting with the first, is used to search for a set of nreq - 1 other pairs that can all be aligned with a difference in global translation of not more than (delx, dely).

Definition at line 649 of file classapp.c.

References CALLOC, countAlignedMatches(), ERROR_INT, FREE, L_ABS, L_CLONE, numaAddNumber(), numaaGetCount(), numaaGetNuma(), numaCreate(), numaDestroy(), numaGetCount(), numaGetIValue(), PROCNAME, testLineAlignmentX(), y1, and y2.

Referenced by main().

Variable Documentation

const l_int32 JB_WORDS_MIN_WIDTH = 5 [static]

Definition at line 43 of file classapp.c.

Referenced by jbWordsInTextlines().

const l_int32 JB_WORDS_MIN_HEIGHT = 3 [static]

Definition at line 44 of file classapp.c.

Referenced by jbWordsInTextlines().

classapp.c File Reference

Defines

Functions

Variables

Detailed Description

Define Documentation

Function Documentation

Variable Documentation