Parsing XHTML results from Bing

Question

i am trying to parse received search queries from bing search engines which are received in xhtml in java. I am using sax XmlReader to read the results but i keep on getting errors. here is my code-this one is for the hadler of the reader:

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;


public class XHTMLHandler extends DefaultHandler{
    public XHTMLHandler()
    {
        super();
    }
    public void startDocument ()
    {
        System.out.println("Start document");
    }
    public void endDocument ()
    {
    System.out.println("End document");
    }
    public void startElement (String uri, String name,String qName, Attributes atts)
    {
        if ("".equals (uri))
                System.out.println("Start element: " + qName);
            else
                System.out.println("Start element: {" + uri + "}" + name);
    }

    public void endElement (String uri, String name, String qName)
    {
    if ("".equals (uri))
        System.out.println("End element: " + qName);
    else
        System.out.println("End element:   {" + uri + "}" + name);
    }
    public void startPrefixMapping (String prefix, String uri)
      throws SAXException {
    }
    public void endPrefixMapping (String prefix)
      throws SAXException {
    }



    public void characters (char ch[], int start, int length)
        {
        System.out.print("Characters:    \"");
        for (int i = start; i < start + length; i++) {
            switch (ch[i]) {
            case '\\':
            System.out.print("\\\\");
            break;
            case '"':
            System.out.print("\\\"");
            break;
            case '\n':
            System.out.print("\\n");
            break;
            case '\r':
            System.out.print("\\r");
            break;
            case '\t':
            System.out.print("\\t");
            break;
            default:
            System.out.print(ch[i]);
            break;
            }
        }
        System.out.print("\"\n");
        }

}

and this is the program itself:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpRetryException;
import java.net.HttpURLConnection;
import java.net.URL;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;


public class Searching {
    private String m_urlBingSearch  = "http://www.bing.com/search?q=";
    private HttpURLConnection m_httpCon;
    private OutputStreamWriter m_streamWriter;
    //private BufferedReader m_bufferReader;
    private URL m_serverAdress;
    private StringBuilder sb;
    private String m_line;
    private InputSource m_inputSrc;
    public Searching()
    {

        m_httpCon = null;
        m_streamWriter = null;
        //m_bufferReader = null;
        m_serverAdress = null;
        sb = null;
        m_line = new String();
    }
    public void SearchBing(String searchPrms) throws SAXException,IOException 
    {


            //set up connection
            sb = new StringBuilder();
            sb.append(m_urlBingSearch);
            sb.append(searchPrms);
            m_serverAdress = new URL(sb.toString());
            m_httpCon = (HttpURLConnection)m_serverAdress.openConnection();
            m_httpCon.setRequestMethod("GET");
            m_httpCon.setDoOutput(true);
            m_httpCon.setConnectTimeout(10000);
            m_httpCon.connect();
            //m_streamWriter = new OutputStreamWriter(m_httpCon.getOutputStream());
            //m_bufferReader = new BufferedReader(new InputStreamReader(m_httpCon.getInputStream()));
            XMLReader reader = XMLReaderFactory.createXMLReader();
            XHTMLHandler handle = new XHTMLHandler();
            reader.setContentHandler(handle);
            reader.setErrorHandler(handle);
            //reader.startPrefixMapping("html", "http://www.w3.org/1999/xhtml");
            handle.startPrefixMapping("html", "http://www.w3.org/1999/xhtml");
            m_inputSrc = new InputSource(m_httpCon.getInputStream());
            reader.parse(m_inputSrc);
            m_httpCon.disconnect();


    }
    public static void main(String [] args) throws SAXException,IOException
    {
        Searching s = new Searching();
        s.SearchBing("beatles");
    }
}

this is my error message:

Exception in thread "main" java.io.IOException: Server returned HTTP response code: 503 for URL: http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.setupCurrentEntity(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startEntity(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startDTDEntity(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLDTDScannerImpl.setInputSource(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.dispatch(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.next(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$PrologDriver.next(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLNSDocumentScannerImpl.next(Unknown Source)
    at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
    at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source)
    at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source)
    at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(Unknown Source)
    at com.sun.org.apache.xerces.internal.parsers.AbstractSAXParser.parse(Unknown Source)
    at Searching.SearchBing(Searching.java:57)
    at Searching.main(Searching.java:65)

can someone please help? i think it has something to do with dtd but i don't know hot to fix it

Doesn't Bing have some kind of webservice you can use, instead of screen scraping their HTML? — Esteban Küber
@voyager: I concur. I think this may be what you're referring to: msdn.microsoft.com/en-us/library/dd900818.aspx — Adam Paynter

bobince bobince · Accepted Answer · 2010-05-05T18:37:53

Server returned HTTP response code: 503 for URL: http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd

Evidently you're trying to parse an XHTML document using an external-entity-fetching parser. It's dragging in the DTD external subset so it can read any declarations for HTML-specific entities like   or é.

You're getting an HTTP 503 from the w3.org server hosting that DTD external subset at the moment, but even if you weren't it'd still be highly impolite to bombard that server with requests for the DTD every time you do a scrape. (Maybe they're blocking you, for that very reason?)

You could create an EntityResolver to return your own local copy of the DTD, or a pared-down version that only includes the entity definitions. Or you can ask the reader not to fetch the DTD at all, by using setFeature to turn that option off, if the XMLReader implementation you have supports that feature. (eg. for Xerxes.) Though then you might get in trouble if the document contains non-builtin entity references like  .

Also since this is a live web page being served as text/html, and especially because it comes from Microsoft, it's probably quite optimistic to assume it will remain well-formed! Screen scraping is usually best done with a parser that's tolerant of HTML quirks. But as the comments above state, using an API is much better bet than screen-scraping in any case.

Parsing XHTML results from Bing

2 Answers