I am trying to convert an HTML5 file to docx using docx4j. The bigger picture is that the HTML contains both Arabic and English data. I have set styling on the elements in my HTML. The HTML looks neat in Chrome, but when I convert it to docx using docx4j, the Arabic text formatting is lost. In MS Word, the Arabic text is shown as having the bold style set, but it is not rendered bold. Similarly, the RTL direction is lost, and tables are flipped from RTL to LTR. As a workaround, I used a BufferedWriter to generate a .doc file, which matched the styling attributes of my HTML file, but the HTML contains a Base64 image, which does not appear in the .doc file. Hence the need to convert to the .docx format. My requirement is an editable document generated from my HTML. Please guide me, as I have been scratching my head over this. None of the example source code I have found works either.
Here is the code I am using to convert HTML to docx.
/**
 * Converts an HTML file to a .docx file using docx4j's XHTML importer.
 *
 * <p>The input file is read as UTF-8, imported into a freshly created
 * {@code WordprocessingMLPackage} (with a default numbering part attached), and saved to
 * {@code outputFilePath}. When {@code headerFlag} or {@code footerFlag} is set, the saved
 * document is loaded again, passed through the header/footer placeholders, and saved a
 * second time to the same path.
 *
 * @param inputFilePath  path of the HTML file to convert
 * @param outputFilePath path the .docx file is written to
 * @param headerFlag     whether the header step should run after conversion
 * @param footerFlag     whether the footer step should run after conversion
 * @param orientation    compared case-insensitively with {@code "Y"}; the derived flag is
 *                       not read by the code visible here
 * @param logoPath       not read by the code visible here
 * @param margin         not read by the code visible here
 * @param json           not read by the code visible here
 * @param isArabic       not read by the code visible here
 * @return {@code true} when conversion completed; {@code false} when any exception was
 *         raised (the exception is logged, not rethrown)
 */
public boolean convertHTMLToDocx(String inputFilePath, String outputFilePath, boolean headerFlag,
        boolean footerFlag, String orientation, String logoPath, String margin, JSONObject json, boolean isArabic) {
    // NOTE(review): this flag is never read below; presumably intended for the header/footer
    // steps. The comparison is written constant-first so a null orientation no longer aborts
    // the whole conversion with a NullPointerException.
    boolean orientationFlag = !"Y".equalsIgnoreCase(orientation);
    try {
        String html = FileUtils.readFileToString(new File(inputFilePath), "UTF-8");

        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();

        // Attach a default numbering part to the main document part before importing.
        NumberingDefinitionsPart ndp = new NumberingDefinitionsPart();
        wordMLPackage.getMainDocumentPart().addTargetPart(ndp);
        ndp.unmarshalDefaultNumbering();

        // Importer properties are set before the importer instance is created.
        ImportXHTMLProperties.setProperty("docx4j-ImportXHTML.Bidi.Heuristic", true);
        ImportXHTMLProperties.setProperty("docx4j-ImportXHTML.Element.Heading.MapToStyle", true);
        ImportXHTMLProperties.setProperty("docx4j-ImportXHTML.fonts.default.serif", "Frutiger LT Arabic 45 Light");
        ImportXHTMLProperties.setProperty("docx4j-ImportXHTML.fonts.default.sans-serif", "Frutiger LT Arabic 45 Light");
        ImportXHTMLProperties.setProperty("docx4j-ImportXHTML.fonts.default.monospace", "Frutiger LT Arabic 45 Light");

        XHTMLImporterImpl xHTMLImporter = new XHTMLImporterImpl(wordMLPackage);
        xHTMLImporter.setHyperlinkStyle("Hyperlink");
        xHTMLImporter.setParagraphFormatting(FormattingOption.CLASS_PLUS_OTHER);
        xHTMLImporter.setTableFormatting(FormattingOption.CLASS_PLUS_OTHER);
        xHTMLImporter.setRunFormatting(FormattingOption.CLASS_PLUS_OTHER);

        wordMLPackage.getMainDocumentPart().getContent().addAll(xHTMLImporter.convert(html, ""));

        File output = new File(outputFilePath);
        wordMLPackage.save(output);
        Console.log("file path where it is stored is" + " " + output.getAbsolutePath());

        if (headerFlag || footerFlag) {
            // Close the stream before saving again: the second save overwrites the same file,
            // and an open handle on it is both a leak and a possible cause of a failed write.
            try (InputStream in = new FileInputStream(output)) {
                wordMLPackage = WordprocessingMLPackage.load(in);
            }
            if (headerFlag) {
                // set Header
            }
            if (footerFlag) {
                // set Footer
            }
            wordMLPackage.save(output);
            Console.log("Finished editing the word document");
        }
        return true;
    } catch (InvalidFormatException e) {
        Error.log("Invalid format found:-" + getStackTrace(e));
        return false;
    } catch (Exception e) {
        Error.log("Error while converting:-" + getStackTrace(e));
        return false;
    }
}