How to use PDFStructTree
Contents
System requirements
Platform: Windows, Mac, Linux (x86 and x64)
Programming Language: C++, C#, Python, Java, Objective-C, C
License Key requirement: Standard
SDK Version: Foxit PDF SDK 9.1
About PDF Structure Tree
The structure tree in a PDF refers to the hierarchical organization and relationships between different elements within the document. It represents the logical structure of the content and is defined using a standard called Tagged PDF. The structure tree enhances accessibility and navigation in PDF documents, allowing individuals using assistive technologies to navigate the content more effectively. It uses predefined tags to describe elements such as headings, paragraphs, lists, tables, and images. PDF viewers and assistive technologies utilize the structure tree to present the document in a meaningful way, offering features like table of contents, text reflow, and navigation by headings. The presence and quality of the structure tree depend on how the document was created, with well-designed documents having a well-defined structure tree and scanned or converted documents potentially lacking accuracy or completeness.
How to use PDFStructTree to traverse objects in tagged PDF
The following code example demonstrates how to use the PDFStructTree class to access objects in PDF.
● Include the C++ standard library headers and the Foxit PDF SDK header files.
// Include c++ standard header files.
#include <map>
#include <vector>
#include <iostream>
// Include Foxit SDK header files.
#include "../../../include/common/fs_common.h"
#include "../../../include/pdf/fs_pdfdoc.h"
#include "../../../include/pdf/fs_pdfpage.h"
#include "../../../include/pdf/annots/fs_annot.h"
#include "../../../include/pdf/graphics/fs_pdfgraphicsobject.h"
#include "../../../include/pdf/objects/fs_pdfstructtree.h"
using namespace foxit;
using namespace foxit::common;
using foxit::common::Library;
using namespace pdf;
using namespace annots;
using namespace graphics;
using namespace objects;
● Define the structs used to map structure-tree entries to the graphics objects on each page.
// Describes where one marked-content graphics object was found.
// Instances are created and filled in by GetContentObject.
// NOTE(review): the name looks like a typo for "ContentObject"; it is left
// unchanged because the rest of the file refers to this spelling.
struct ContetnObject {
// Position of the graphics object in the container it was enumerated from.
POSITION pos_ = NULL;
// True when GetContentObject recorded the object while at least one form
// XObject layer was open (i.e. the layer stack was non-empty).
bool is_in_form_obj_ = false;
// Copy of the layer stack at the time of recording: the indices pushed by
// GetContentObject each time it descended into a form XObject.
std::vector<int> vec_form_layer_;
};
// Groups the recorded content objects that share one content dictionary
// (a page dictionary or a form XObject stream dictionary).
struct PageObject {
// Page that was being scanned when the objects were recorded.
PDFPage page_;
// Marked-content ID (MCID) -> every recorded graphics object tagged with it.
std::map<FX_UINT32, std::vector<ContetnObject>> map_content_object_;
};
● Forward-declare the two helper functions, which call each other recursively.
// Enumerates the graphics objects of `graphics_container` and hands each one
// to GetContentObject. Declared up front because GetContentObject calls it
// back when it descends into a form XObject.
void GetObjects(PDFPage& page, GraphicsObjects* graphics_container, PDFDictionary* dict, std::map<PDFDictionary*, PageObject>& content_object, std::vector<int>& vec_cur_layer);
// Records a single graphics object in `content_object` (keyed by `dict` and
// the object's MCID), or recurses into it when it is an untagged form XObject.
void GetContentObject(PDFPage& page, GraphicsObject* graphics_object, std::map<PDFDictionary*, PageObject>& content_object,
POSITION cur_pos, PDFDictionary* dict, int index, std::vector<int>& vec_cur_layer);
● Retrieve the objects from a GraphicsObjects, including graphics objects and content objects.
// Walks every graphics object stored in `graphics_container` and passes each
// non-null one to GetContentObject so that marked content can be recorded.
//
// page               Page currently being scanned; stored alongside every
//                    recorded object.
// graphics_container Container to enumerate (the page itself, or the contents
//                    of a form XObject during recursion). Ignored when null.
// dict               Dictionary used as the key in `content_object` for the
//                    objects found in this container.
// content_object     Output map that receives the recorded objects.
// vec_cur_layer      Stack of form-XObject indices leading to this container;
//                    empty at page level.
void GetObjects(PDFPage& page, GraphicsObjects* graphics_container, PDFDictionary* dict, std::map<PDFDictionary*, PageObject>& content_object, std::vector<int>& vec_cur_layer) {
  if (!graphics_container) return;

  // Zero-based index of the current object within this container. It is what
  // GetContentObject pushes onto `vec_cur_layer` when it enters a form XObject.
  int index = 0;
  POSITION pos = graphics_container->GetFirstGraphicsObjectPosition();
  while (pos) {
    const POSITION cur_pos = pos;
    GraphicsObject* graphics_object = graphics_container->GetGraphicsObject(cur_pos);

    // Advance first, and always through the container being enumerated (not
    // the page): skipping a null object can then never stall the loop, and
    // positions that belong to a form XObject are never handed to the page.
    pos = graphics_container->GetNextGraphicsObjectPosition(cur_pos);

    if (graphics_object) {
      GetContentObject(page, graphics_object, content_object, cur_pos, dict, index, vec_cur_layer);
    }
    ++index;
  }
}
● Record the marked content associated with a single graphics object, recursing into form XObjects.
// Records one graphics object in `content_object`, or descends into it when
// it is a form XObject that carries no marked-content ID of its own.
//
// page            Page currently being scanned; stored in the map entry.
// graphics_object Object to inspect (must not be null).
// content_object  Output map: dictionary -> (MCID -> recorded objects).
// cur_pos         Position of `graphics_object` inside its container.
// dict            Key under which the object is recorded.
// index           Index of `graphics_object` inside its container; pushed onto
//                 `vec_cur_layer` while its children are being scanned.
// vec_cur_layer   Stack of form-XObject indices leading to the container.
void GetContentObject(PDFPage& page, GraphicsObject* graphics_object, std::map<PDFDictionary*, PageObject>& content_object,
                      POSITION cur_pos, PDFDictionary* dict, int index, std::vector<int>& vec_cur_layer) {
  // -1 means "no marked-content ID available for this object".
  int mc_id = -1;
  auto* marked_content = graphics_object->GetMarkedContent();
  // Only query item 0 when the marked content actually has an item.
  if (marked_content && marked_content->GetItemCount() > 0) {
    mc_id = marked_content->GetItemMCID(0);
  }

  // Stores the object under (dict, mc_id) together with the form nesting that
  // was active when it was found, so leaf objects inside a form XObject carry
  // the same layer information as tagged form XObjects do.
  auto record_object = [&]() {
    PageObject& entry = content_object[dict];
    entry.page_ = page;
    ContetnObject object;
    object.pos_ = cur_pos;
    object.is_in_form_obj_ = !vec_cur_layer.empty();
    object.vec_form_layer_ = vec_cur_layer;
    entry.map_content_object_[mc_id].push_back(object);
  };

  switch (graphics_object->GetType()) {
    case GraphicsObject::e_TypeText:
    case GraphicsObject::e_TypePath:
    case GraphicsObject::e_TypeImage:
    case GraphicsObject::e_TypeShading:
      // Leaf objects are recorded only when they are tagged.
      if (mc_id != -1) record_object();
      break;

    case GraphicsObject::e_TypeFormXObject: {
      if (mc_id != -1) {
        // The form itself is tagged: record it as a single unit.
        record_object();
        break;
      }
      // Untagged form: scan its own content, keyed by the form stream's
      // dictionary, with this form's index added to the layer stack.
      FormXObject* form_obj = static_cast<FormXObject*>(graphics_object);
      auto* form_stream = form_obj->GetStream();
      if (!form_stream) break;  // No stream, hence no dictionary to key by.
      PDFDictionary* form_dict = form_stream->GetDict();
      GraphicsObjects form_objects = form_obj->GetGraphicsObjects();
      vec_cur_layer.push_back(index);
      GetObjects(page, &form_objects, form_dict, content_object, vec_cur_layer);
      vec_cur_layer.pop_back();
    } break;

    default:
      break;
  }
}
● TraversalStructTree function traverses a structure tree. It iterates through the elements in the tree and performs different operations based on their type.
// Recursively walks a structure-tree entity.
//  - Structure element: visits every child.
//  - Marked content: looks up the graphics objects recorded for its stream
//    dictionary and MCID, and writes the text of every text object to
//    std::cout, followed by a newline.
//  - Anything else is treated as object content; for annotations the matching
//    annotation on the page is located.
//
// map_content_object  Map built beforehand by GetObjects/GetContentObject.
// struct_tree_entity  Entity to process.
void TraversalStructTree(std::map<PDFDictionary*, PageObject>& map_content_object, StructTreeEntity& struct_tree_entity) {
  if (StructTreeEntity::e_StructTreeEntityTypeElement == struct_tree_entity.GetType()) {
    StructElement element(struct_tree_entity);
    const int count = element.GetChildCount();
    for (int i = 0; i < count; i++) {
      StructTreeEntity entity = element.GetChild(i);
      TraversalStructTree(map_content_object, entity);
    }
  } else if (StructTreeEntity::e_StructTreeEntityTypeMarkedContent == struct_tree_entity.GetType()) {
    StructMarkedContent marked_content(struct_tree_entity);
    PDFDictionary* dict = marked_content.GetStmDict();
    const int mcid = marked_content.GetMCID();

    // Nothing was recorded for this dictionary: nothing to print.
    auto it_container = map_content_object.find(dict);
    if (it_container == map_content_object.end()) return;

    // Use the page that was stored when the objects were recorded.
    PDFPage pdf_page = it_container->second.page_;

    // The MCID may be absent from the map; the iterator must be compared with
    // end() before it is dereferenced.
    auto& mcid_map = it_container->second.map_content_object_;
    auto it_mcid = mcid_map.find(mcid);
    if (it_mcid == mcid_map.end() || it_mcid->second.empty()) return;

    // Iterate by reference: copying the vector and its elements is unnecessary.
    for (const auto& content_object : it_mcid->second) {
      GraphicsObject* graph_object = pdf_page.GetGraphicsObject(content_object.pos_);
      if (graph_object && graph_object->GetType() == GraphicsObject::e_TypeText) {
        TextObject* text_object = static_cast<TextObject*>(graph_object);
        WString text = text_object->GetText();
        std::cout << text.UTF8Encode();
      }
    }
    std::cout << std::endl;
  } else {
    StructObjectContent struct_object(struct_tree_entity);
    if (struct_object.GetObjectType() == StructObjectContent::e_StructObjectTypeAnnot) {
      PDFDictionary* dict = struct_object.GetDict();
      PDFPage page = struct_object.GetPage();
      page.StartParse();
      const int count = page.GetAnnotCount();
      for (int i = 0; i < count; i++) {
        Annot annot = page.GetAnnot(i);
        if (annot.IsEmpty()) continue;
        // The annotation whose dictionary is the one referenced by the
        // structure tree is the one this entity stands for.
        if (annot.GetDict() == dict) {
          Annot::Type type = annot.GetType();
          // Application-specific handling of the annotation goes here.
          static_cast<void>(type);
          break;
        }
      }
    }
  }
}
● The main function is the program's entry point: it initializes the library, loads the document, and traverses its structure tree.
int main(int argc, char *argv[])
{
int err_ret = 0;
// Initialize library
// The parameter "sn" can be found in the "gsdk_sn.txt" (the string after "SN=") and the "key" can be found in the "gsdk_key.txt" (the string after "Sign=").
ErrorCode error_code = Library::Initialize(sn, key);
if (error_code != foxit::e_ErrSuccess) {
return 1;
}
try {
// Load the document
PDFDoc doc("AboutFoxit.pdf");
ErrorCode error_code = doc.Load();
std::map<PDFDictionary*, PageObject> content_object;
if (doc.IsTaggedPDF()) {
int page_count = doc.GetPageCount();
for (int i = 0; i < page_count; i++) {
std::vector<int> vec_cur_layer;
PDFPage page = doc.GetPage(i);
page.StartParse();
PDFDictionary* dict = page.GetDict();
GetObjects(page, &page, dict, content_object, vec_cur_layer);
}
PDFStructTree struct_tree(doc);
if (!struct_tree.IsEmpty()) {
int count = struct_tree.GetChildCount();
for (int i = 0; i < count; i++) {
StructElement element = struct_tree.GetChild(i);
TraversalStructTree(content_object, element);
}
}
}
} catch (const Exception& e) {
std::cout << e.GetMessage() << std::endl;
err_ret = 1;
} catch (...) {
std::cout << "Unknown Exception" << std::endl;
err_ret = 1;
}
Library::Release();
return err_ret;
}
Updated on October 13, 2023