Documentation

Platform

Foxit PDF SDK for Linux Foxit PDF SDK for Mac Foxit PDF SDK for Windows

How to use PDFStructTree

Contents

System requirements
About PDF Structure Tree
How to use PDFStructTree to traverse objects in tagged PDF

System requirements

Platform: Windows, Mac, Linux (x86 and x64)
Programming Language: C++, C#, Python, Java, OC, C
License Key requirement: Standard
SDK Version: Foxit PDF SDK 9.1

About PDF Structure Tree

The structure tree in a PDF refers to the hierarchical organization and relationships between different elements within the document. It represents the logical structure of the content and is defined using a standard called Tagged PDF. The structure tree enhances accessibility and navigation in PDF documents, allowing individuals using assistive technologies to navigate the content more effectively. It uses predefined tags to describe elements such as headings, paragraphs, lists, tables, and images. PDF viewers and assistive technologies utilize the structure tree to present the document in a meaningful way, offering features like table of contents, text reflow, and navigation by headings. The presence and quality of the structure tree depend on how the document was created, with well-designed documents having a well-defined structure tree and scanned or converted documents potentially lacking accuracy or completeness.

How to use PDFStructTree to traverse objects in tagged PDF

The following code example demonstrates how to use the PDFStructTree class to access objects in PDF.

● Include the C++ standard library headers and the Foxit PDF SDK header files.

// Include c++ standard header files.
#include <map>
#include <vector>
#include <iostream>
// Include Foxit SDK header files.
#include "../../../include/common/fs_common.h"
#include "../../../include/pdf/fs_pdfdoc.h"
#include "../../../include/pdf/fs_pdfpage.h"
#include "../../../include/pdf/annots/fs_annot.h"
#include "../../../include/pdf/graphics/fs_pdfgraphicsobject.h"
#include "../../../include/pdf/objects/fs_pdfstructtree.h"
 
using namespace foxit;
using namespace foxit::common;
using foxit::common::Library;
using namespace pdf;
using namespace annots;
using namespace graphics;
using namespace objects;

● Define the structs used to map structure tree entries to page content.

// Describes one graphics object that GetContentObject recorded for a
// marked-content ID.
struct ContetnObject {
  // Position of the graphics object, as passed to GetContentObject.
  POSITION pos_{};
  // Only filled in for form XObject entries: true when at least one form
  // layer was already open at the time the entry was recorded.
  bool is_in_form_obj_{false};
  // Only filled in for form XObject entries: copy of the form-layer index
  // stack at the time the entry was recorded.
  std::vector<int> vec_form_layer_{};
};
 
// Everything GetContentObject recorded for one content dictionary (the key of
// the outer std::map<PDFDictionary*, PageObject>).
struct PageObject {
  // Page that was being processed when the entries were recorded.
  PDFPage page_;
  // Marked-content ID -> content objects recorded under that ID.
  std::map<FX_UINT32, std::vector<ContetnObject>> map_content_object_;
};

● Function declaration.

// Iterates over the graphics objects in graphics_container and passes each one
// to GetContentObject. Declared up front because the two functions call each other.
void GetObjects(PDFPage& page, GraphicsObjects* graphics_container, PDFDictionary* dict, std::map<PDFDictionary*, PageObject>& content_object, std::vector<int>& vec_cur_layer);
// Records a single graphics object in content_object (keyed by dict and MCID),
// or descends into it via GetObjects when it is an untagged form XObject.
void GetContentObject(PDFPage& page, GraphicsObject* graphics_object, std::map<PDFDictionary*, PageObject>& content_object,
  POSITION cur_pos, PDFDictionary* dict, int index, std::vector<int>& vec_cur_layer);

● Retrieve the objects from a GraphicsObjects container (a page or a form XObject) and collect the content object for each graphics object.

// Walks every graphics object in graphics_container and hands each one to
// GetContentObject so that its marked content is recorded in content_object.
//
//   page               - page being processed; stored with each recorded entry.
//   graphics_container - container to iterate (the page itself, or the
//                        graphics objects of a form XObject when recursing).
//   dict               - dictionary used as the key into content_object.
//   content_object     - output map that receives the recorded entries.
//   vec_cur_layer      - indices of the form XObjects currently being
//                        descended into (empty at page level).
void GetObjects(PDFPage& page, GraphicsObjects* graphics_container, PDFDictionary* dict, std::map<PDFDictionary*, PageObject>& content_object, std::vector<int>& vec_cur_layer) {
  if (!graphics_container) return;

  // Zero-based ordinal of the current object within this container; it is
  // recorded as the form-layer index when GetContentObject descends into a
  // form XObject, so it has to advance together with the position.
  int index = 0;
  // The position is advanced in the loop header so that skipping a null
  // object cannot stall the loop, and it is advanced on the container being
  // iterated (not on the page) so that recursion into form XObjects walks the
  // form's own object list.
  for (POSITION pos = graphics_container->GetFirstGraphicsObjectPosition(); pos;
       pos = graphics_container->GetNextGraphicsObjectPosition(pos), ++index) {
    GraphicsObject* graphics_object = graphics_container->GetGraphicsObject(pos);
    // Skip null graphics objects.
    if (!graphics_object) continue;
    // Record the content object associated with this graphics object.
    GetContentObject(page, graphics_object, content_object, pos, dict, index, vec_cur_layer);
  }
}

● Get the content object associated with a single graphics object.

// Records one graphics object in content_object, or descends into it.
//
// - Text, path, image and shading objects that carry an MCID are stored under
//   content_object[dict].map_content_object_[mcid].
// - A form XObject that carries an MCID is stored the same way, together with
//   the current form-layer stack.
// - A form XObject without an MCID is entered recursively: its own graphics
//   objects are processed with the form stream's dictionary as the new key.
//
//   page            - page being processed; stored in the recorded entry.
//   graphics_object - object to inspect; a null pointer is ignored.
//   content_object  - output map that receives the recorded entries.
//   cur_pos         - position of graphics_object in its container.
//   dict            - dictionary used as the key into content_object.
//   index           - index of graphics_object supplied by the caller; pushed
//                     onto vec_cur_layer while a form XObject is processed.
//   vec_cur_layer   - stack of form-layer indices; restored before returning.
void GetContentObject(PDFPage& page, GraphicsObject* graphics_object, std::map<PDFDictionary*, PageObject>& content_object,
  POSITION cur_pos, PDFDictionary* dict, int index, std::vector<int>& vec_cur_layer) {
  if (!graphics_object) return;

  // Only the first marked-content item is consulted. The item count is checked
  // first so that item 0 is never requested from an empty sequence.
  int mc_id = -1;
  auto* marked_content = graphics_object->GetMarkedContent();
  if (marked_content && marked_content->GetItemCount() > 0) {
    mc_id = marked_content->GetItemMCID(0);
  }

  switch (graphics_object->GetType()) {
  case GraphicsObject::e_TypeText:
  case GraphicsObject::e_TypePath:
  case GraphicsObject::e_TypeImage:
  case GraphicsObject::e_TypeShading: {
    // Leaf objects are only of interest when they carry an MCID.
    if (mc_id == -1)
      break;
    PageObject& entry = content_object[dict];
    entry.page_ = page;
    ContetnObject object;
    object.pos_ = cur_pos;
    entry.map_content_object_[mc_id].push_back(object);
  } break;
  case GraphicsObject::e_TypeFormXObject: {
    if (mc_id != -1) {
      // The form XObject itself is tagged: record it with its layer context.
      PageObject& entry = content_object[dict];
      entry.page_ = page;
      ContetnObject object;
      object.pos_ = cur_pos;
      object.is_in_form_obj_ = !vec_cur_layer.empty();
      object.vec_form_layer_ = vec_cur_layer;
      entry.map_content_object_[mc_id].push_back(object);
    } else {
      // Untagged form XObject: process the graphics objects it contains.
      FormXObject* form_obj = static_cast<FormXObject*>(graphics_object);
      auto* form_stream = form_obj->GetStream();
      // Without a stream there is no dictionary to key the contents by.
      if (!form_stream)
        break;
      vec_cur_layer.push_back(index);
      GraphicsObjects form_objects = form_obj->GetGraphicsObjects();
      PDFDictionary* form_dict = form_stream->GetDict();
      GetObjects(page, &form_objects, form_dict, content_object, vec_cur_layer);
      vec_cur_layer.pop_back();
    }
  } break;
  default:
    break;
  }
}

● The TraversalStructTree function traverses a structure tree recursively. It visits each entity in the tree and performs a different operation depending on the entity's type.

// Recursively visits one entity of the structure tree.
//
// - Structure element: every child entity is visited in order.
// - Marked content: the graphics objects recorded for its stream dictionary
//   and MCID are looked up in map_content_object; the text of each text
//   object is written to std::cout, followed by a line break. Nothing is
//   printed when no matching entry was recorded.
// - Anything else is treated as object content: for annotation objects the
//   owning page's annotations are searched for the one whose dictionary
//   matches.
//
//   map_content_object - lookup table filled by GetObjects/GetContentObject.
//   struct_tree_entity - entity to visit.
void TraversalStructTree(std::map<PDFDictionary*, PageObject>& map_content_object, StructTreeEntity& struct_tree_entity) {
  const auto entity_type = struct_tree_entity.GetType();

  if (entity_type == StructTreeEntity::e_StructTreeEntityTypeElement) {
    StructElement element(struct_tree_entity);
    const int count = element.GetChildCount();
    for (int i = 0; i < count; i++) {
      StructTreeEntity entity = element.GetChild(i);
      // Recursively traverse the child entity.
      TraversalStructTree(map_content_object, entity);
    }
    return;
  }

  if (entity_type == StructTreeEntity::e_StructTreeEntityTypeMarkedContent) {
    StructMarkedContent marked_content(struct_tree_entity);
    PDFDictionary* dict = marked_content.GetStmDict();
    const int mcid = marked_content.GetMCID();

    // Find what was recorded for this content dictionary.
    auto it_container = map_content_object.find(dict);
    if (it_container == map_content_object.end()) return;

    // Find what was recorded for this MCID. The iterator must be compared
    // against end() before it is dereferenced: an MCID that was never
    // recorded is a normal case, not an error.
    auto& objects_by_mcid = it_container->second.map_content_object_;
    auto it_mcid = objects_by_mcid.find(mcid);
    if (it_mcid == objects_by_mcid.end() || it_mcid->second.empty()) return;

    PDFPage pdf_page = it_container->second.page_;
    for (const auto& content_object : it_mcid->second) {
      // NOTE(review): positions recorded inside a form XObject were obtained
      // from the form's own container; confirm they can be resolved through
      // the page as done here.
      GraphicsObject* graph_object = pdf_page.GetGraphicsObject(content_object.pos_);
      // Only text objects contribute to the output.
      if (graph_object && graph_object->GetType() == GraphicsObject::e_TypeText) {
        TextObject* text_object = static_cast<TextObject*>(graph_object);
        WString text = text_object->GetText();
        std::cout << text.UTF8Encode();
      }
    }
    std::cout << std::endl;
    return;
  }

  // Remaining entity types are handled as object content.
  StructObjectContent struct_object(struct_tree_entity);
  if (struct_object.GetObjectType() == StructObjectContent::e_StructObjectTypeAnnot) {
    PDFDictionary* dict = struct_object.GetDict();
    PDFPage page = struct_object.GetPage();
    page.StartParse();
    const int count = page.GetAnnotCount();
    for (int i = 0; i < count; i++) {
      Annot annot = page.GetAnnot(i);
      if (annot.IsEmpty()) continue;
      // The annotation whose dictionary is the referenced one is the match.
      if (annot.GetDict() == dict) {
        // Placeholder: the sample only retrieves the type; use it as needed.
        Annot::Type type = annot.GetType();
        (void)type;
      }
    }
  }
}

● The main function serves as the entry point of the program for PDFStructTree traversal.

int main(int argc, char *argv[])
{
  int err_ret = 0;
  // Initialize library
  // The parameter "sn" can be found in the "gsdk_sn.txt" (the string after "SN=") and the "key" can be found in the "gsdk_key.txt" (the string after "Sign=").
  ErrorCode error_code = Library::Initialize(sn, key);
  if (error_code != foxit::e_ErrSuccess) {
    return 1;
  }
 
  try {
   // Load the document
   PDFDoc doc("AboutFoxit.pdf");
    ErrorCode error_code = doc.Load();
    std::map<PDFDictionary*, PageObject> content_object;
    if (doc.IsTaggedPDF()) {
      int page_count = doc.GetPageCount();
      for (int i = 0; i < page_count; i++) {
        std::vector<int> vec_cur_layer;
        PDFPage page = doc.GetPage(i);
        page.StartParse();
        PDFDictionary* dict = page.GetDict();
        GetObjects(page, &page, dict, content_object, vec_cur_layer);
      }
 
      PDFStructTree struct_tree(doc);
      if (!struct_tree.IsEmpty()) {
        int count = struct_tree.GetChildCount();
        for (int i = 0; i < count; i++) {
          StructElement element = struct_tree.GetChild(i);
          TraversalStructTree(content_object, element);
        }
      }
    }
  } catch (const Exception& e) {
    std::cout << e.GetMessage() << std::endl;
    err_ret = 1;
  } catch (...) {
    std::cout << "Unknown Exception" << std::endl;
    err_ret = 1;
  }
  Library::Release();
  return err_ret;
}

Updated on October 13, 2023

Was this article helpful?