Friday, February 23, 2018

Remove blank PDF pages with PDFBox

I came across a challenging issue: I needed to remove blank pages from a PDF programmatically using Java. A quick online search yielded suggestions on how this can be achieved with iText (for those of you wanting to achieve the same with the iText library, have a look at the blog post "Detect and remove blank page in pdf (iText)").

As for me, I wanted to use the PDFBox library since I was already using it in my project. My online search yielded no good results. In fact, the only somewhat useful link I found was the Stack Overflow question "How to find blank pages inside a PDF using PDFBox?".

Ok, time to get down to work and craft some code to do this.... and here it is.


 /**
  * Collects every XObject registered in the resource dictionary of a page.
  * <p>
  * Despite the method name, all XObject kinds are returned (not only images),
  * so a page that references a form XObject is also reported as having content.
  *
  * @param page the page whose resources are inspected
  * @return a map from XObject name to XObject, or {@code null} when the page has
  *         no resources or no XObjects (callers test the result against {@code null})
  * @throws IOException if an XObject cannot be loaded from the resources
  */
 public static final Map<String, PDXObject> getImageObjectMap(PDPage page) throws IOException {

  PDResources resources = page.getResources();

  // A page is not required to carry a resource dictionary; without this guard
  // such a page would fail with a NullPointerException instead of being
  // reported as "no XObjects".
  if (resources == null)
  {
   return null;
  }

  Map<String, PDXObject> objects = new HashMap<String, PDXObject>();

  for (COSName name : resources.getXObjectNames())
  {
   PDXObject xObj = resources.getXObject(name);

   // Skip names that do not resolve to an actual object.
   if (xObj != null)
   {
    objects.put(name.getName(), xObj);
   }
  }

  // Keep the historical contract: "nothing found" is signalled with null.
  return objects.isEmpty() ? null : objects;
 }


 /**
  * Removes the page at index {@code pageNum} from {@code pdfDoc} if it is blank.
  * <p>
  * A page is treated as blank when all of the following hold:
  * <ul>
  *  <li>{@link #getImageObjectMap(PDPage)} reports no XObjects for it;</li>
  *  <li>its content stream(s) contain none of the path operators
  *      {@code m l c v y h re f} and no shading operator {@code sh};</li>
  *  <li>text extraction for the page yields only whitespace.</li>
  * </ul>
  * The checks run in that order and stop at the first one that finds content.
  *
  * @param pdfDoc  document the page is removed from
  * @param page    the page to examine; expected to be the page at index {@code pageNum}
  * @param pageNum zero-based index of {@code page} inside {@code pdfDoc}
  * @return {@code true} if the page was removed, {@code false} if it was kept
  * @throws Exception if the content stream cannot be read or text extraction fails
  */
 public static boolean removeBlankPage(PDDocument pdfDoc, PDPage page, int pageNum) throws Exception
 {
  // check no images
  if (getImageObjectMap(page) != null)
  {
   return false;
  }

  // Check no graphics (path objects such as lines, boxes) and no shading on page
  if (hasPathOrShadingOperator(page))
  {
   return false;
  }

  // Check no text content on page; the stripper's page numbers are 1-based,
  // hence pageNum + 1.
  org.apache.pdfbox.text.PDFTextStripper txtstripper = new org.apache.pdfbox.text.PDFTextStripper();
  txtstripper.setStartPage(pageNum + 1);
  txtstripper.setEndPage(pageNum + 1);
  String pageText = txtstripper.getText(pdfDoc);
  if (!pageText.trim().isEmpty())
  {
   return false;
  }

  // Nothing visible was found: remove the page now
  pdfDoc.removePage(pageNum);
  return true;
 }

 /**
  * Scans the page's content stream(s) for path or shading operators.
  * <p>
  * The content is read through {@code PDPage.getContents()}, so a page with no
  * /Contents entry or with an array of content streams is handled instead of
  * failing on a cast to a single stream. Every whitespace-delimited token is
  * checked, so an operator is found regardless of the line-ending style used
  * in the stream and regardless of whether it is the last token on its line.
  * Tokens inside string literals are not distinguished from operators, which
  * can only make the result more conservative (a page is kept, never wrongly
  * removed).
  */
 private static boolean hasPathOrShadingOperator(PDPage page) throws java.io.IOException
 {
  // ISO-8859-1 maps every byte to one char, so decoding can never fail and
  // the ASCII operators are preserved as-is.
  try (java.io.BufferedReader reader = new java.io.BufferedReader(
    new java.io.InputStreamReader(page.getContents(), java.nio.charset.StandardCharsets.ISO_8859_1)))
  {
   String line;
   while ((line = reader.readLine()) != null)
   {
    for (String token : line.trim().split("\\s+"))
    {
     if (isPathOrShadingOperator(token))
     {
      return true;
     }
    }
   }
  }
  return false;
 }

 /**
  * Tells whether a content stream token is one of the operators that mark graphics.
  * See the path construction operators table (Table 4.9) in the PDF Reference.
  */
 private static boolean isPathOrShadingOperator(String token)
 {
  switch (token)
  {
   case "m":  // start path
   case "l":  // straight line
   case "c":  // curve
   case "v":  // curve
   case "y":  // curve
   case "h":  // close path
   case "re": // rectangle
   case "f":  // fill path
   case "sh": // shading
    return true;
   default:
    return false;
  }
 }
 
 
 /**
  * Walks through all pages of the document in order and removes every page
  * that {@link #removeBlankPage(PDDocument, PDPage, int)} reports as removed.
  *
  * @param pdfDoc the document to clean up in place
  * @throws Exception if examining or removing a page fails
  */
 public static void removeBlankPages(PDDocument pdfDoc) throws Exception
 {
  PDPageTree pages = pdfDoc.getPages();

  int index = 0;
  while (index < pages.getCount())
  {
   PDPage current = pages.get(index);
   boolean removed = removeBlankPage(pdfDoc, current, index);

   // After a removal the following page moves into this slot, so the
   // index only moves forward when the current page was kept.
   if (!removed)
   {
    index++;
   }
  }
 }

No comments:

Post a Comment