I have the following C# console app code that uses the HTMLAgilityPack to analyze some sample HTML:
static void Main(string[] args)
{
string input = @"
<span style=""font-style: italic"">This is the title</span>.
This is the introductory text:
<ol>
<li>List Item One</li>
<li>List Item Two</li>
<li>List Item Three</li>
<li>This list item is nested:
<ol>
<li>List Item Four A.</li>
<li>List Item Four B.</li>
</ol>
Yes it is.
</li>
<li>List Item Five</li>
</ol>
This is the footer text. Last updated: July 20, 2014
";
HtmlDocument doc = new HtmlDocument();
try
{
doc.LoadHtml(input);
}
catch (Exception e)
{
LogIt("ERROR: " + e.Message);
return;
}
HtmlNode get_title = doc.DocumentNode.SelectSingleNode("//span");
if (get_title != null)
{
LogIt("Title: '" + get_title.InnerHtml + "'");
}
HtmlNodeCollection get_outer_lists = doc.DocumentNode.SelectNodes("//ol//li");
if (get_outer_lists != null)
{
foreach (HtmlNode hn_outer in get_outer_lists)
{
LogIt("Begin outer for");
LogIt("outer HTML: '" + hn_outer.OuterHtml + "'");
// Now fetch inner list, the text above the inner list, and the
// text below the inner list.
HtmlNodeCollection get_inner_lists = doc.DocumentNode.SelectNodes("//ol//li//ol//li");
if (get_inner_lists != null)
{
foreach (HtmlNode hn_inner in get_inner_lists)
{
LogIt("\tinner HTML: '" + hn_inner.OuterHtml + "'");
}
}
else
{
LogIt("ERROR: Could not get inner list");
}
}
}
else
{
LogIt("ERROR: Could not select //ol//li");
Console.Read();
return;
}
Console.Read();
return;
}
private static void LogIt(string str)
{
Console.WriteLine(str);
return;
}
...and this is the output:
Title: 'This is the title'
Begin outer for
outer HTML: '<li>List Item One</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Two</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Three</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>This list item is nested:
<ol>
<li>List Item Four A.</li>
<li>List Item Four B.</li>
</ol>
Yes it is.
</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Four B.</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Five</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
- I can get the title text just fine, but how do I get the introductory text or the footer? They don't belong to an HTML element I can select.
- The outer foreach loop iterates through both the outer and the inner ordered lists. How do I change the XPath string so that the outer for loop only iterates through the outer list? The inner for loop should take care of the inner list.