Friday, March 17, 2006

Parsing HTML using tidy and tidylib

It's so hard to find a C program on the web that can parse HTML! Yes, you can find parsers written in Perl and other languages, but not C!

So I might as well share what I've learnt so far. I am writing the 7DS HTML parser with libxml, but I also experimented with tidy and tidylib, and here is what the code for that looks like:


#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

#include <tidy.h>
#include <buffio.h>

/**
 * Dump the attributes of every descendant of a node.
 * Modified from tidylib documentation.
 *
 * Walks the children of tnod depth-first. For every attribute found on a
 * child, one line is written to stdout:
 *
 *     <indent spaces><node name> <attribute name>= <attribute value>
 *
 * Each level of recursion indents four more columns than its parent.
 *
 * tnod   - node whose children are dumped; a NULL node is ignored.
 * indent - number of leading spaces; values below zero are treated as 0.
 */
void dumpNode( TidyNode tnod, int indent )
{
    TidyNode child;
    /* printf's "%*s" treats a negative width as a left-justify flag, so clamp. */
    int pad = ( indent > 0 ) ? indent : 0;

    if ( !tnod )
        return;

    for ( child = tidyGetChild(tnod); child; child = tidyGetNext(child) )
    {
        TidyAttr tattr;
        ctmbstr name = tidyNodeGetName( child );

        if ( !name )
        {
            /* Unnamed nodes get a descriptive label based on their type. */
            switch ( tidyNodeGetType(child) )
            {
            case TidyNode_Root:    name = "Root"; break;
            case TidyNode_DocType: name = "DOCTYPE"; break;
            case TidyNode_Comment: name = "Comment"; break;
            case TidyNode_ProcIns: name = "Processing Instruction"; break;
            case TidyNode_Text:    name = "Text"; break;
            case TidyNode_CDATA:   name = "CDATA"; break;
            case TidyNode_Section: name = "XML Section"; break;
            case TidyNode_Asp:     name = "ASP"; break;
            case TidyNode_Jste:    name = "JSTE"; break;
            case TidyNode_Php:     name = "PHP"; break;
            case TidyNode_XmlDecl: name = "XML Declaration"; break;

            case TidyNode_Start:
            case TidyNode_End:
            case TidyNode_StartEnd:
            default:
                /* Not expected for an unnamed node; use a printable
                 * placeholder so the output below never sees NULL. */
                name = "Unknown";
                break;
            }
        }

        /* Print the node together with each of its attributes. */
        for ( tattr = tidyAttrFirst(child); tattr != NULL; tattr = tidyAttrNext(tattr) )
        {
            ctmbstr attr_name  = tidyAttrName( tattr );
            ctmbstr attr_value = tidyAttrValue( tattr );

            /* "%*s" with an empty string emits exactly `pad` spaces, which
             * avoids a scratch buffer (and its zero-length case) entirely.
             * NULL strings are replaced by "" because passing NULL to "%s"
             * is undefined; presumably a valueless attribute yields NULL. */
            printf( "%*s%s %s= %s\n", pad, "", name,
                    attr_name ? attr_name : "",
                    attr_value ? attr_value : "" );
        }

        dumpNode( child, pad + 4 );
    }
}

/* Dump every node of the document, starting from its root, with no indentation */
void dumpDoc( TidyDoc tdoc )
{
    TidyNode root = tidyGetRoot( tdoc );

    dumpNode( root, 0 );
}

/*
 * Dump only the body.
 *
 * Looks up the document's body node and dumps its descendants with no
 * initial indentation. Nothing is printed when no body node is returned
 * (presumably the case for a document that was never parsed or has no
 * body element).
 */
void dumpBody( TidyDoc tdoc )
{
    TidyNode body = tidyGetBody( tdoc );

    if ( body )
        dumpNode( body, 0 );
}

/*
 * Tidy an HTML file, print the result, then dump its node attributes.
 *
 * The input file is the first command-line argument, or "../test.html"
 * when none is given. Each tidy step runs only if the previous one did not
 * return a negative status. The process exit status is the last tidy
 * status code (negative on severe error).
 */
int main(int argc, char **argv )
{
    /* Input file: Either the first argument or "../test.html" */
    const char* input = (argc > 1) ? argv[1] : "../test.html";
    TidyBuffer output = {0};
    TidyBuffer errbuf = {0};
    int rc = -1;
    Bool ok;

    TidyDoc tdoc = tidyCreate();                        /* Initialize "document" */
    printf( "Tidying:\t%s\n", input );

    ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );     /* Convert to XHTML */
    if ( ok )
    {
        rc = tidySetErrorBuffer( tdoc, &errbuf );       /* Capture diagnostics */
    }
    if ( rc >= 0 )
    {
        rc = tidyParseFile( tdoc, input );              /* Read and parse the HTML file */
    }
    if ( rc >= 0 )
    {
        rc = tidyCleanAndRepair( tdoc );                /* Tidy it up! */
    }
    if ( rc >= 0 )
    {
        rc = tidyRunDiagnostics( tdoc );                /* Kvetch */
    }
    if ( rc > 1 )                                       /* If error, force output. */
    {
        rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
    }
    if ( rc >= 0 )
    {
        rc = tidySaveBuffer( tdoc, &output );           /* Pretty Print */
    }

    if ( rc >= 0 )
    {
        /* A buffer that was never written to may have a NULL data pointer;
         * substitute "" because passing NULL to "%s" is undefined. */
        if ( rc > 0 )
        {
            printf( "\nDiagnostics:\n\n%s",
                    errbuf.bp ? (const char*)errbuf.bp : "" );
        }
        printf( "\nAnd here is the result:\n\n%s",
                output.bp ? (const char*)output.bp : "" );
    }
    else
    {
        /* Real newline here: the original "\\n" printed a literal backslash-n. */
        printf( "A severe error (%d) occurred.\n", rc );
    }

    tidyBufFree( &output );
    tidyBufFree( &errbuf );

    /* Now parse and print the tags in the HTML document */
    dumpDoc (tdoc);

    tidyRelease( tdoc );
    return rc;
}

No comments: