2
votes

I want to extract the values of text nodes from an XML input. I got the following code from the web, because the official libxml documentation has many broken links, the SAX parser page among them. Please help me obtain the value of a text node. In startElementNs, when I try to look for my text node, I get NULL. I would appreciate any help here.

My xml looks like this:

<a>
   <b>
      <c> text values </c>
   </b>
</a>

My code looks like this:

#include <assert.h>
#include <memory.h>
#include <stdarg.h>
#include <stdio.h>

#include <string>

#include <libxml/xmlmemory.h>
#include <libxml/parser.h>


class ParseFSM
{
public:
   /** SAX2 callback when an element start has been detected by the parser. It provides the namespace informations for the element, as well as the new namespace declarations on the element.
      ctx:  the user data (XML parser context)
      localname:  the local name of the element
      prefix:  the element namespace prefix if available
      URI:  the element namespace name if available
      nb_namespaces: number of namespace definitions on that node
      namespaces: pointer to the array of prefix/URI pairs namespace definitions
      nb_attributes: the number of attributes on that node
      nb_defaulted:  the number of defaulted attributes. The defaulted ones are at the end of the array
      attributes: pointer to the array of (localname/prefix/URI/value/end) attribute values.
      **/
  static void startElementNs (void *ctx,
                  const xmlChar * localname,
                  const xmlChar * prefix,
                  const xmlChar * URI,
                  int nb_namespaces,
                  const xmlChar ** namespaces,
                  int nb_attributes,
                  int nb_defaulted, const xmlChar ** attributes)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    printf ("startElementNs: name = '%s' prefix = '%s' uri = (%p)'%s'\n", localname, prefix, URI, URI);
    for (int indexNamespace = 0; indexNamespace < nb_namespaces; ++indexNamespace)
      {
        const xmlChar *prefix = namespaces[indexNamespace * 2];
        const xmlChar *nsURI = namespaces[indexNamespace * 2 + 1];
        printf ("  namespace: name='%s' uri=(%p)'%s'\n", prefix, nsURI, nsURI);
      }
    unsigned int index = 0;
    for (int indexAttribute = 0; indexAttribute < nb_attributes; ++indexAttribute, index += 5)
      {
        const xmlChar *localname = attributes[index];
        const xmlChar *prefix = attributes[index + 1];
        const xmlChar *nsURI = attributes[index + 2];
        const xmlChar *valueBegin = attributes[index + 3];
        const xmlChar *valueEnd = attributes[index + 4];
        std::string value ((const char *) valueBegin, (const char *) valueEnd);
        printf ("  %sattribute: localname='%s', prefix='%s', uri=(%p)'%s', value='%s'\n", indexAttribute >= (nb_attributes - nb_defaulted) ? "defaulted " : "", localname, prefix, nsURI, nsURI, value.c_str ());
      }
  }
   /** SAX2 callback when an element end has been detected by the parser. It provides the namespace informations for the element.
      ctx:  the user data (XML parser context)
      localname:  the local name of the element
      prefix:  the element namespace prefix if available
      URI:  the element namespace name if available
      **/
  static void endElementNs (void *ctx,
                const xmlChar * localname,
                const xmlChar * prefix, const xmlChar * URI)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    printf ("endElementNs: name = '%s' prefix = '%s' uri = '%s'\n", localname,
        prefix, URI);
  }
   /** Display and format an error messages, callback.
      ctx:  an XML parser context
      msg:  the message to display/transmit
      ...:  extra parameters for the message display
      */
  static void error (void *ctx, const char *msg, ...)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    va_list args;
    va_start (args, msg);
    vprintf (msg, args);
    va_end (args);
  }

   /** Display and format a warning messages, callback.
      ctx:  an XML parser context
      msg:  the message to display/transmit
      ...:  extra parameters for the message display
      */
  static void warning (void *ctx, const char *msg, ...)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    va_list args;
    va_start (args, msg);
    vprintf (msg, args);
    va_end (args);
  }
};
int
main (int argc, const char *argv[])
{
  std::string xmlIn = "<a><b><c> text values </c> </b> </a>"
  /*
   * this initialize the library and check potential ABI mismatches
   * between the version it was compiled for and the actual shared
   * library used.
   */
  LIBXML_TEST_VERSION xmlSAXHandler saxHandler; // See http://xmlsoft.org/html/libxml-tree.html#xmlSAXHandler
  memset (&saxHandler, 0, sizeof (saxHandler));
  // Using xmlSAXVersion( &saxHandler, 2 ) generate crash as it sets plenty of other pointers...
  saxHandler.initialized = XML_SAX2_MAGIC;  // so we do this to force parsing as SAX2.
  saxHandler.startElementNs = &ParseFSM::startElementNs;
  saxHandler.endElementNs = &ParseFSM::endElementNs;
  saxHandler.warning = &ParseFSM::warning;
  saxHandler.error = &ParseFSM::error;

  ParseFSM fsm;
  int result =
    xmlSAXUserParseMemory (&saxHandler, &fsm, xmlIn.c_str (),
               int (xmlIn.length ()));
  if (result != 0)
    {
      printf ("Failed to parse document.\n");
      return 1;
    }

  /*
   * Cleanup function for the XML library.
   */
  xmlCleanupParser ();
  /*
   * this is to debug memory for regression tests
   */
  xmlMemoryDump ();

  return 0;
}
1
There are no text nodes in a SAX parser. Only DOM parsers have nodes. Maybe you aren't using the right kind of parser? - john
If you want to get the text out of your XML document using a SAX parser then you need to look at the characters call back, jamesh.id.au/articles/libxml-sax/libxml-sax.html#characters - john
Thanks John, but en.wikipedia.org/wiki/Simple_API_for_XML says any SAX parser generates events for start/end element nodes and for text nodes as well. I will look at the characters callback. - gyro
Look at the documentation - 'Note that the character data is not necessarily nul terminated.' You have to use the len parameter to determine how many characters there are. - john
@john I have implemented the characters function, but it is appending the tags as well. Is this behavior expected? The following is my code. link - gyro

1 Answers

1
votes
  1. You need to use the characters callback:

    void characters( void * user_data, const xmlChar * ch, int len);

  2. The strings are not null-terminated; you need to use ch and len to determine the string.

  3. Another problem with this callback is that it can be called multiple times between the start and end of an element, so you can't blindly assume that what you get in one callback is the whole string between the tags. You may need to use a string builder or something similar to collect the pieces.

In your callback, you will probably want to copy the characters to some other buffer so that it can be used from the endElement callback. To optimise this callback a bit, you might adjust the callback so that it only copies the characters if the parser is in a certain state. Note that the characters callback may be called more than once between calls to startElement and endElement.

Hope this answers your question; even if it's late, others might find it helpful.