HTML-to-DOCX conversion

As a further development of the HTML-to-RTF conversion feature, we are pleased to introduce the ability to create DOCX documents from styled HTML templates using PD4ML.

PD4ML can convert the following HTML elements and features to DOCX:

  • Page margins
  • Text styles and fonts
  • Page, text, paragraph and table cell backgrounds
  • Text indentation
  • Ordered and unordered lists (right-to-left Arabic and Hebrew direction support is coming soon)
  • Tables (with correct table nesting), including col- and row-spans,
    table and cell backgrounds, cell paddings and border styles.
  • Images
  • Hyperlinks (external and internal)
  • Complex headers / footers (i.e. including tables). An individual header
    and footer can be defined for the title page.
  • Forced page breaks

The HTML to DOCX conversion can be triggered by the following API calls:

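// com.pd4ml API (as in the first full example below): read and parse HTML, then write DOCX
pd4ml.readHTML(inputStream);
pd4ml.writeDOCX(outputStream);

// org.zefer.pd4ml API (as in the second full example below): select DOCX output, then render
pd4ml.outputFormat(PD4Constants.DOCX);
pd4ml.render(inputStream, outputStream);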

The equivalents in JSP taglib:

<pd4tl:transform ... outputFormat="docx"> ... </pd4tl:transform>
<pd4ml:transform ... outputFormat="docx"> ... </pd4ml:transform>

The equivalent command-line calls:

java -Xmx512m -Djava.awt.headless=true -jar ./pd4ml.jar <URL> 1200 -out doc.docx -outformat docx
java -Xmx512m -Djava.awt.headless=true -cp ./pd4ml.jar Pd4Cmd <URL> 1200 -out doc.docx -outformat docx
 rtfwmf

Full converter Java application examples:


package samples;

import java.awt.Insets;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.InvalidParameterException;

import com.pd4ml.Dimensions.Units;
import com.pd4ml.PD4ML;
import com.pd4ml.PageMargins;
import com.pd4ml.PageSize;

/**
 * Sample converter: reads an HTML page from a URL and writes it out as a DOCX
 * file using the {@code readHTML} / {@code writeDOCX} calls of {@code com.pd4ml.PD4ML}.
 */
public class GettingStarted2 {
	// Page margin values; passed to PageMargins together with Units.MM.
	protected int topValue = 10;
	protected int leftValue = 20;
	protected int rightValue = 10;
	protected int bottomValue = 10;
	// Value passed to setHtmlWidth() (frame width of the "virtual web browser").
	protected int userSpaceWidth = 1300;

	/**
	 * Converts a fixed demo URL to a fixed output path. Any exception is
	 * reported by printing its stack trace.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		try {
			GettingStarted2 jt = new GettingStarted2();
			jt.doConversion("https://pd4ml.com/i/rtf/demo.htm", "c:/invoice.docx");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Converts the HTML document at {@code url} to a DOCX file at {@code outputPath}.
	 * The output stream is closed even when the conversion fails.
	 *
	 * @param url        address of the HTML document to convert
	 * @param outputPath path of the DOCX file to create
	 * @throws InvalidParameterException declared for the PD4ML calls; the exact
	 *                                   conditions are not visible in this sample
	 * @throws MalformedURLException     if {@code url} is not a valid URL
	 * @throws IOException               if the output file cannot be opened, written or closed
	 */
	public void doConversion( String url, String outputPath ) 
				throws InvalidParameterException, MalformedURLException, IOException {
		File output = new File(outputPath);
		java.io.FileOutputStream fos = new java.io.FileOutputStream(output);

		// try/finally guarantees the file handle is released on the error path too
		try {
			PD4ML pd4ml = new PD4ML();

			pd4ml.setHtmlWidth(userSpaceWidth); // set frame width of "virtual web browser" 

			// choose target paper format and "rotate" it to landscape orientation
			pd4ml.setPageSize(PageSize.A4.rotate()); 

			// define page margins of the resulting document
			pd4ml.setPageMargins(new PageMargins(topValue, leftValue, bottomValue, rightValue, Units.MM)); 

			// read and parse HTML
			pd4ml.readHTML(new URL(url));

			pd4ml.writeDOCX(fos);  // actual document conversion from URL to DOCX file
		} finally {
			fos.close();
		}

		System.out.println( outputPath + "\ndone." );
	}
}

package samples;

import java.awt.Insets;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.InvalidParameterException;

import org.zefer.pd4ml.PD4Constants;
import org.zefer.pd4ml.PD4ML;

/**
 * Sample converter: renders an HTML page from a URL to a DOCX file using the
 * {@code outputFormat} / {@code render} calls of {@code org.zefer.pd4ml.PD4ML}.
 */
public class GettingStarted2 {
	// Page inset values; passed to setPageInsetsMM() as a java.awt.Insets.
	protected int topValue = 10;
	protected int leftValue = 20;
	protected int rightValue = 10;
	protected int bottomValue = 10;
	// Value passed to setHtmlWidth() (frame width of the "virtual web browser").
	protected int userSpaceWidth = 1300;

	/**
	 * Converts a fixed demo URL to a fixed output path. Any exception is
	 * reported by printing its stack trace.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		try {
			GettingStarted2 jt = new GettingStarted2();
			jt.doConversion("https://pd4ml.com/i/rtf/demo.htm", "c:/invoice.docx");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Converts the HTML document at {@code url} to a DOCX file at {@code outputPath}.
	 * The output stream is closed even when the conversion fails.
	 *
	 * @param url        address of the HTML document to convert
	 * @param outputPath path of the DOCX file to create
	 * @throws InvalidParameterException declared for the PD4ML calls; the exact
	 *                                   conditions are not visible in this sample
	 * @throws MalformedURLException     if {@code url} is not a valid URL
	 * @throws IOException               if the output file cannot be opened, written or closed
	 */
	public void doConversion( String url, String outputPath ) 
				throws InvalidParameterException, MalformedURLException, IOException {
		File output = new File(outputPath);
		java.io.FileOutputStream fos = new java.io.FileOutputStream(output);

		// try/finally guarantees the file handle is released on the error path too
		try {
			PD4ML pd4ml = new PD4ML();

			pd4ml.setHtmlWidth(userSpaceWidth); // set frame width of "virtual web browser" 

			// choose target paper format and "rotate" it to landscape orientation
			pd4ml.setPageSize(pd4ml.changePageOrientation(PD4Constants.A4)); 

			// define page margins of the resulting document
			pd4ml.setPageInsetsMM(new Insets(topValue, leftValue, bottomValue, rightValue)); 

			// Force generate DOCX instead of PDF
			pd4ml.outputFormat(PD4Constants.DOCX);

			pd4ml.render(new URL(url), fos); // actual document conversion from URL to DOCX file
		} finally {
			fos.close();
		}

		System.out.println( outputPath + "\ndone." );
	}
}

DOCX conversion samples: