I have one problem with parsing *.docx document with OpenXML (C#).
So, here's my steps:
1. Load *.docx document
2. Recieve list of paragraphs
3. In each paragraph look for text,image and table elements
4. For each text and image element create html tags
5. Save output as *.html file
I've found out how to locate image file in document and extract it. Now there's one step to go - find where is table position in text (paragraph).
If any one know how to locate table in *.docx document using OpenXML please help. Thanks.
Additional: Ok, may be I'm not clear explain what i mean. If we get content of paragraph, you can find chield objects as textblocks, pictures and so on. So, if paragraph contains Run that contains Picture it means that in this place in Word document placed image.
Sample of my function:
public static string ParseDocxDocument(string pathToFile)
{
StringBuilder result = new StringBuilder();
WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, true);
List<ImagePart> imgPart = wordProcessingDoc.MainDocumentPart.ImageParts.ToList();
IEnumerable<Paragraph> paragraphElement = wordProcessingDoc.MainDocumentPart.Document.Descendants<Paragraph>();
int imgCounter = 0;
foreach (Paragraph par in paragraphElement)
{
//Add new paragraph tag
result.Append("<div style=\"width:100%; text-align:");
//Append anchor style
if (par.ParagraphProperties != null && par.ParagraphProperties.Justification != null)
switch (par.ParagraphProperties.Justification.Val.Value)
{
case JustificationValues.Left:
result.Append("left;");
break;
case JustificationValues.Center:
result.Append("center;");
break;
case JustificationValues.Both:
result.Append("justify;");
break;
case JustificationValues.Right:
default:
result.Append("right;");
break;
}
else
result.Append("left;");
//Append text decoration style
if (par.ParagraphProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties.HasChildren)
foreach (OpenXmlElement chield in par.ParagraphProperties.ParagraphMarkRunProperties.ChildElements)
{
switch (chield.GetType().Name)
{
case "Bold":
result.Append("font-weight:bold;");
break;
case "Underline":
result.Append("text-decoration:underline;");
break;
case "Italic":
result.Append("font-style:italic;");
break;
case "FontSize":
result.Append("font-size:" + ((FontSize)chield).Val.Value + "px;");
break;
default: break;
}
}
result.Append("\">");
//Add image tag
IEnumerable<Run> runs = par.Descendants<Run>();
foreach (Run run in runs)
{
if (run.HasChildren)
{
foreach (OpenXmlElement chield in run.ChildElements.Where(o => o.GetType().Name == "Picture"))
{
result.Append(string.Format("<img style=\"{1}\" src=\"data:image/jpeg;base64,{0}\" />", GetBase64Image(imgPart[imgCounter].GetStream()),
((DocumentFormat.OpenXml.Vml.Shape)chield.ChildElements.Where(o => o.GetType().Name == "Shape").FirstOrDefault()).Style
));
imgCounter++;
}
}
}
//Append inner text
IEnumerable<Text> textElement = par.Descendants<Text>();
if (par.Descendants<Text>().Count() == 0)
result.Append("<br />");
foreach (Text t in textElement)
{
result.Append(t.Text);
}
result.Append("</div>");
result.Append(Environment.NewLine);
}
wordProcessingDoc.Close();
return result.ToString();
}
Now I whant to specify table place in text (as it appear in Word).
Final:
Ok, everyone, I've found out. In my sample function one big mistake. I'm enumerate Paragraph elements of document Body. Tables are at same level as Paragraph, so function ignore tables. So we need to enumerate Elements of document Body.
Here's my test function to generate correct HTML from docx (it's just test code, so it's not clean)
public static string ParseDocxDocument(string pathToFile)
{
StringBuilder result = new StringBuilder();
WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, true);
List<ImagePart> imgPart = wordProcessingDoc.MainDocumentPart.ImageParts.ToList();
List<string> tableCellContent = new List<string>();
IEnumerable<Paragraph> paragraphElement = wordProcessingDoc.MainDocumentPart.Document.Descendants<Paragraph>();
int imgCounter = 0;
foreach (OpenXmlElement section in wordProcessingDoc.MainDocumentPart.Document.Body.Elements<OpenXmlElement>())
{
if(section.GetType().Name == "Paragraph")
{
Paragraph par = (Paragraph)section;
//Add new paragraph tag
result.Append("<div style=\"width:100%; text-align:");
//Append anchor style
if (par.ParagraphProperties != null && par.ParagraphProperties.Justification != null)
switch (par.ParagraphProperties.Justification.Val.Value)
{
case JustificationValues.Left:
result.Append("left;");
break;
case JustificationValues.Center:
result.Append("center;");
break;
case JustificationValues.Both:
result.Append("justify;");
break;
case JustificationValues.Right:
default:
result.Append("right;");
break;
}
else
result.Append("left;");
//Append text decoration style
if (par.ParagraphProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties != null && par.ParagraphProperties.ParagraphMarkRunProperties.HasChildren)
foreach (OpenXmlElement chield in par.ParagraphProperties.ParagraphMarkRunProperties.ChildElements)
{
switch (chield.GetType().Name)
{
case "Bold":
result.Append("font-weight:bold;");
break;
case "Underline":
result.Append("text-decoration:underline;");
break;
case "Italic":
result.Append("font-style:italic;");
break;
case "FontSize":
result.Append("font-size:" + ((FontSize)chield).Val.Value + "px;");
break;
default: break;
}
}
result.Append("\">");
//Add image tag
IEnumerable<Run> runs = par.Descendants<Run>();
foreach (Run run in runs)
{
if (run.HasChildren)
{
foreach (OpenXmlElement chield in run.ChildElements.Where(o => o.GetType().Name == "Picture"))
{
result.Append(string.Format("<img style=\"{1}\" src=\"data:image/jpeg;base64,{0}\" />", GetBase64Image(imgPart[imgCounter].GetStream()),
((DocumentFormat.OpenXml.Vml.Shape)chield.ChildElements.Where(o => o.GetType().Name == "Shape").FirstOrDefault()).Style
));
imgCounter++;
}
foreach (OpenXmlElement table in run.ChildElements.Where(o => o.GetType().Name == "Table"))
{
result.Append("<strong>HERE'S TABLE</strong>");
}
}
}
//Append inner text
IEnumerable<Text> textElement = par.Descendants<Text>();
if (par.Descendants<Text>().Count() == 0)
result.Append("<br />");
foreach (Text t in textElement.Where(o=>!tableCellContent.Contains(o.Text.Trim())))
{
result.Append(t.Text);
}
result.Append("</div>");
result.Append(Environment.NewLine);
}
else if (section.GetType().Name=="Table")
{
result.Append("<table>");
Table tab = (Table)section;
foreach (TableRow row in tab.Descendants<TableRow>())
{
result.Append("<tr>");
foreach (TableCell cell in row.Descendants<TableCell>())
{
result.Append("<td>");
result.Append(cell.InnerText);
tableCellContent.Add(cell.InnerText.Trim());
result.Append("</td>");
}
result.Append("</tr>");
}
result.Append("</table>");
}
}
wordProcessingDoc.Close();
return result.ToString();
}
private static string GetBase64Image(Stream inputData)
{
byte[] data = new byte[inputData.Length];
inputData.Read(data, 0, data.Length);
return Convert.ToBase64String(data);
}