Detecting PAGE_BREAK element in Google docs using app script

Question

Problem statement:

I have a Google Doc with N pages. The requirement is to convert each page into a separate Google Doc.

Solution tried:

I tried to find the PAGE_BREAK element by traversing the body; when a PAGE_BREAK is detected, the plan is to create a Range, copy the content, and then create a new Google Document.

Issue:

I created a sample doc with one line of text on page 1 and one line of text on page 2. When parsing the doc, I am not able to detect a PAGE_BREAK element. I expected that when the content flows onto page 2 there would be a PAGE_BREAK, which in this case should be a child of a PARAGRAPH element. Below is the sample Google Apps Script snippet I tried:

// Script-wide handles created by the top-level statements below.
// Functions in this script read `body` as a global rather than receiving it as an argument.
var activeDocument = DocumentApp.getActiveDocument();
var body = activeDocument.getBody();

/**
 * Empties the active document by calling `clear()` on the global `body`.
 */
function resetDoc() {
  body.clear(); 
  // Kept for reference (author's observation): only when the page break is added
  // by the script itself, as in the disabled lines below, does findElement detect
  // the PAGE_BREAK element.
  // body.appendParagraph("Page 1");
  // body.appendPageBreak();
  // body.appendParagraph("Page 2");
}


/**
 * Entry point: builds the element tree for the global `body` with
 * `extractTree` and writes the resulting object to the log.
 */
function init() {
  Logger.log(extractTree(body));
}


/**
 * Recursively builds a plain-object mirror of a document element tree.
 *
 * Every node has the shape `{ element: <the element> }`. Elements that expose
 * `getNumChildren` additionally get a `children` array holding one node per
 * child, in index order. While walking, a message is logged for each child
 * whose own type is PAGE_BREAK, and the type of every child is logged once its
 * subtree has been processed.
 *
 * @param {Object} element - Element to start from. Containers must provide
 *     `getNumChildren()` and `getChild(index)`; every child must provide `getType()`.
 * @returns {Object} The node for `element` (with `children` only for containers).
 */
function extractTree(element) {
  const node = {
    element: element,
  };

  // Elements without getNumChildren are treated as leaves: no `children` key.
  if (!element.getNumChildren) {
    return node;
  }

  const pageBreakType = DocumentApp.ElementType.PAGE_BREAK;
  const numChildren = element.getNumChildren();
  const children = [];

  for (var i = 0; i < numChildren; i++) {
    var child = element.getChild(i);
    var childType = child.getType();

    // Report a break only when this child itself is one. A document-wide search
    // here would fire for every child as soon as any break exists anywhere.
    if (childType === pageBreakType) {
      Logger.log("Found page break at " + i);
    }

    children.push(extractTree(child));
    Logger.log(childType);
  }

  node["children"] = children;
  return node;
}


/**
 * Reports whether a PAGE_BREAK exists in (or is) the given element, logging the
 * outcome.
 *
 * The search is scoped to `element`: containers are queried through their own
 * `findElement`, and elements without `findElement` match only if their own type
 * is PAGE_BREAK. When called without an argument the global `body` is searched,
 * which keeps the earlier whole-document behaviour available.
 *
 * @param {Object} [element] - Element to inspect; defaults to the global `body`.
 * @returns {boolean} `true` if a page break was found, otherwise `false`.
 */
function findBreak(element) {
  var searchType = DocumentApp.ElementType.PAGE_BREAK;

  // Fall back to the whole body only when no element was supplied.
  var scope = element == null ? body : element;

  var found;
  if (typeof scope.findElement === 'function') {
    found = Boolean(scope.findElement(searchType));
  } else {
    // No findElement available: the element can only match by being a break itself.
    found = typeof scope.getType === 'function' && scope.getType() === searchType;
  }

  Logger.log(found ? "Found page break" : "No page break");
  return found;
}

Any suggestions on how I should go about solving this problem?

Logs:

[19-04-12 15:46:32:636 IST] TEXT
[19-04-12 15:46:32:637 IST] PARAGRAPH
[19-04-12 15:46:32:638 IST] PARAGRAPH
[19-04-12 15:46:32:640 IST] PARAGRAPH
[19-04-12 15:46:32:642 IST] PARAGRAPH
[19-04-12 15:46:32:643 IST] PARAGRAPH
[19-04-12 15:46:32:645 IST] PARAGRAPH
[19-04-12 15:46:32:647 IST] PARAGRAPH
[19-04-12 15:46:32:648 IST] PARAGRAPH
[19-04-12 15:46:32:650 IST] PARAGRAPH
[19-04-12 15:46:32:651 IST] PARAGRAPH
[19-04-12 15:46:32:653 IST] PARAGRAPH
[19-04-12 15:46:32:655 IST] PARAGRAPH
[19-04-12 15:46:32:656 IST] PARAGRAPH
[19-04-12 15:46:32:658 IST] PARAGRAPH
[19-04-12 15:46:32:660 IST] PARAGRAPH
[19-04-12 15:46:32:662 IST] PARAGRAPH
[19-04-12 15:46:32:663 IST] PARAGRAPH
[19-04-12 15:46:32:665 IST] PARAGRAPH
[19-04-12 15:46:32:666 IST] PARAGRAPH
[19-04-12 15:46:32:668 IST] PARAGRAPH
[19-04-12 15:46:32:670 IST] PARAGRAPH
[19-04-12 15:46:32:671 IST] PARAGRAPH
[19-04-12 15:46:32:673 IST] PARAGRAPH
[19-04-12 15:46:32:675 IST] PARAGRAPH
[19-04-12 15:46:32:676 IST] PARAGRAPH
[19-04-12 15:46:32:678 IST] PARAGRAPH
[19-04-12 15:46:32:680 IST] PARAGRAPH
[19-04-12 15:46:32:682 IST] PARAGRAPH
[19-04-12 15:46:32:684 IST] PARAGRAPH
[19-04-12 15:46:32:685 IST] PARAGRAPH
[19-04-12 15:46:32:687 IST] PARAGRAPH
[19-04-12 15:46:32:689 IST] PARAGRAPH
[19-04-12 15:46:32:690 IST] PARAGRAPH
[19-04-12 15:46:32:692 IST] PARAGRAPH
[19-04-12 15:46:32:693 IST] PARAGRAPH
[19-04-12 15:46:32:695 IST] PARAGRAPH
[19-04-12 15:46:32:697 IST] PARAGRAPH
[19-04-12 15:46:32:699 IST] PARAGRAPH
[19-04-12 15:46:32:701 IST] PARAGRAPH
[19-04-12 15:46:32:702 IST] PARAGRAPH
[19-04-12 15:46:32:704 IST] PARAGRAPH
[19-04-12 15:46:32:705 IST] PARAGRAPH
[19-04-12 15:46:32:706 IST] PARAGRAPH
[19-04-12 15:46:32:708 IST] TEXT
[19-04-12 15:46:32:709 IST] PARAGRAPH
[19-04-12 15:46:32:710 IST] PARAGRAPH
[19-04-12 15:46:32:711 IST] PARAGRAPH
[19-04-12 15:46:32:712 IST] {children=[{children=[{element=Text}], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[{element=Text}], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}], element=DocumentBodySection}

The entry logged at [19-04-12 15:46:32:706 IST] should be a PAGE_BREAK, but it is reported as a PARAGRAPH.

Sample google document:
https://docs.google.com/document/d/1bs_Jcfs-n1VEx65Ew5buBpsf_JCHgX0A7NHYIY8mAqw/edit?usp=sharing

Reference link:
1. Google app script documentation
https://developers.google.com/apps-script/reference/document/page-break

Александр Ермолин Александр Ермолин · Accepted Answer · 2019-04-13T10:50:13

First of all, I am not sure I understand the task correctly, because in the general case having N visual pages does not mean having N-1 explicit page breaks. I have assumed that you want to split only on explicit page breaks, since you have already made some attempts to find them.

The most useful unit (object) for copying document fragments in this case is the Paragraph. The following function takes all of the document's paragraphs and checks whether each one contains a PAGE_BREAK element. If a PAGE_BREAK is found, it marks the end of one page and the beginning of another, so at that point we create a new target document and continue copying into it.

/**
 * Splits the active document into several new documents at explicit page breaks.
 *
 * Paragraphs are copied, in order, into a document named 'PageBreak.<n>'
 * (n starts at 0). A paragraph that contains a PAGE_BREAK element is still
 * copied into the current document; a new document with the next number is
 * created right after it and receives the paragraphs that follow.
 */
function copyPartsByPageBreaks() {
  var paragraphs = DocumentApp.getActiveDocument().getBody().getParagraphs();
  var docNumber = 0;
  var destination = DocumentApp.create('PageBreak.' + docNumber).getBody();

  for (var k = 0; k < paragraphs.length; k++) {
    var current = paragraphs[k];
    destination.appendParagraph(current.copy());

    // No break in this paragraph: keep filling the same document.
    if (current.findElement(DocumentApp.ElementType.PAGE_BREAK) == null) {
      continue;
    }

    // The break closes the current part; open the next target document.
    docNumber += 1;
    destination = DocumentApp.create('PageBreak.' + docNumber).getBody();
  }
}

Detecting PAGE_BREAK element in Google docs using app script

Problem statement:

Solution tried:

Issue:

1 Answers