Whole document tree
    

Whole document tree

HTMLparser

HTMLparser

Name

HTMLparser -- 

Synopsis



typedef     htmlParserCtxt;
typedef     htmlParserCtxtPtr;
typedef     htmlParserNodeInfo;
typedef     htmlSAXHandler;
typedef     htmlSAXHandlerPtr;
typedef     htmlParserInput;
typedef     htmlParserInputPtr;
typedef     htmlDocPtr;
typedef     htmlNodePtr;
struct      htmlElemDesc;
typedef     htmlElemDescPtr;
struct      htmlEntityDesc;
typedef     htmlEntityDescPtr;
htmlElemDescPtr htmlTagLookup               (const xmlChar *tag);
htmlEntityDescPtr htmlEntityLookup          (const xmlChar *name);
int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);
int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);
htmlEntityDescPtr htmlParseEntityRef        (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);
int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);
void        htmlParseElement                (htmlParserCtxtPtr ctxt);
htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);
htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);
void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);
htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);
int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Description

Details

htmlParserCtxt

typedef xmlParserCtxt htmlParserCtxt;


htmlParserCtxtPtr

typedef xmlParserCtxtPtr htmlParserCtxtPtr;


htmlParserNodeInfo

typedef xmlParserNodeInfo htmlParserNodeInfo;


htmlSAXHandler

typedef xmlSAXHandler htmlSAXHandler;


htmlSAXHandlerPtr

typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;


htmlParserInput

typedef xmlParserInput htmlParserInput;


htmlParserInputPtr

typedef xmlParserInputPtr htmlParserInputPtr;


htmlDocPtr

typedef xmlDocPtr htmlDocPtr;


htmlNodePtr

typedef xmlNodePtr htmlNodePtr;


struct htmlElemDesc

struct htmlElemDesc {
    const char *name;	/* The tag name */
    int startTag;       /* Whether the start tag can be implied */
    int endTag;         /* Whether the end tag can be implied */
    int empty;          /* Is this an empty element ? */
    int depr;           /* Is this a deprecated element ? */
    int dtd;            /* 1: only in Loose DTD, 2: only Frameset one */
    const char *desc;   /* the description */
};


htmlElemDescPtr

typedef htmlElemDesc *htmlElemDescPtr;


struct htmlEntityDesc

struct htmlEntityDesc {
    int value;		/* the UNICODE value for the character */
    const char *name;	/* The entity name */
    const char *desc;   /* the description */
};


htmlEntityDescPtr

typedef htmlEntityDesc *htmlEntityDescPtr;


htmlTagLookup ()

htmlElemDescPtr htmlTagLookup               (const xmlChar *tag);

Lookup the HTML tag in the ElementTable

tag : The tag name
Returns :the related htmlElemDescPtr or NULL if not found.


htmlEntityLookup ()

htmlEntityDescPtr htmlEntityLookup          (const xmlChar *name);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly, a hash table is really needed.

name : the entity name
Returns :the associated htmlEntityDescPtr if found, NULL otherwise.


htmlIsAutoClosed ()

int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if a tag is autoclosed by one of its children.

doc : the HTML document
elem : the HTML element
Returns :1 if autoclosed, 0 otherwise


htmlAutoCloseTag ()

int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.

doc : the HTML document
name : The tag name
elem : the HTML element
Returns :1 if autoclose, 0 otherwise


htmlParseEntityRef ()

htmlEntityDescPtr htmlParseEntityRef        (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);

parse an HTML entity reference

[68] EntityRef ::= '&' Name ';'

ctxt : an HTML parser context
str : location to store the entity name
Returns :the associated htmlEntityDescPtr if found, or NULL otherwise. If *str is non-NULL on return, it will have to be freed by the caller.


htmlParseCharRef ()

int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);

parse Reference declarations

[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'

ctxt : an HTML parser context
Returns :the value parsed (as an int)


htmlParseElement ()

void        htmlParseElement                (htmlParserCtxtPtr ctxt);

parse an HTML element, this is highly recursive

[39] element ::= EmptyElemTag | STag content ETag

[41] Attribute ::= Name Eq AttValue

ctxt : an HTML parser context


htmlSAXParseDoc ()

htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

parse an HTML in-memory document and build a tree. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, it falls back to the default DOM tree building routines.

cur : a pointer to an array of xmlChar
encoding : a free form C string describing the HTML document encoding, or NULL
sax : the SAX handler block
userData : if using SAX, this pointer will be provided on callbacks.
Returns :the resulting document tree


htmlParseDoc ()

htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);

parse an HTML in-memory document and build a tree.

cur : a pointer to an array of xmlChar
encoding : a free form C string describing the HTML document encoding, or NULL
Returns :the resulting document tree


htmlSAXParseFile ()

htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, it falls back to the default DOM tree building routines.

filename : the filename
encoding : a free form C string describing the HTML document encoding, or NULL
sax : the SAX handler block
userData : if using SAX, this pointer will be provided on callbacks.
Returns :the resulting document tree


htmlParseFile ()

htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.

filename : the filename
encoding : a free form C string describing the HTML document encoding, or NULL
Returns :the resulting document tree


htmlFreeParserCtxt ()

void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);

Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.

ctxt : an HTML parser context


htmlCreatePushParserCtxt ()

htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);

Create a parser context for using the HTML parser in push mode. To allow content encoding detection, size should be >= 4. The value of filename is used for fetching external entities and for error/warning reports.

sax : a SAX handler
user_data : The user data returned on SAX callbacks
chunk : a pointer to an array of chars
size : number of chars in the array
filename : an optional file name or URI
enc : an optional encoding
Returns :the new parser context or NULL


htmlParseChunk ()

int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Parse a Chunk of memory

ctxt : an HTML parser context
chunk : a char array
size : the size in bytes of the chunk
terminate : last chunk indicator
Returns :zero if no error, the xmlParserErrors otherwise.