How to Identify PDFs using Layout Recognition
This article aims to demonstrate how to identify a PDF using Layout Recognition.
The data structure produced by Layout Recognition resembles a tree whose root node is an instance of class LRStructureElement. Generally, an LRStructureElement has many child nodes, each of which may be an LRGraphicsObjectElement, an LRContentElement or another LRStructureElement. An LRContentElement may have a corresponding LRGraphicsObjectElement, whereas an LRGraphicsObjectElement generally does not contain child nodes.
We use Foxit PDF SDK to do it and you can get a free 30 day trial here.
Platform: Windows, Linux, Mac
Programming Language: C++, Java, .NET, .NET Core, Objective-C, C
License Key Requirement: Standard with Layout Recognition
SDK Version: Foxit PDF SDK 8.4
The following are some code examples. The function IdentifyPage is the main function: it identifies the page's layout and then saves the data to a text file.
C++:
//The class used for writing text.
// Thin wrapper around a C FILE handle. The constructors open the file and
// write a UTF-16LE byte-order mark; the destructor closes the file.
class TextDoc {
 public:
  // Opens file_name with the fopen-style mode string file_mode.
  // Throws Exception(foxit::e_ErrFile) when the file cannot be opened.
  TextDoc(const String& file_name, const String& file_mode);
  TextDoc(const WString& file_name, const WString& file_mode);
  ~TextDoc();

  // Converts a narrow string to wide text and writes it.
  void Write(const char* text_content);
  // Writes wide text encoded as UTF-16LE; empty text writes nothing.
  void Write(const wchar_t* text_content);
  // Writes prefix count times, then text_content.
  void Write(int count, const wchar_t* prefix, const wchar_t* text_content);

 private:
  // Copying would leave two objects closing the same FILE*, so the copy
  // operations are declared private and intentionally left undefined.
  TextDoc(const TextDoc&);
  TextDoc& operator=(const TextDoc&);

  FILE* file_;
};
// Opens file_name using the fopen-style mode string file_mode and writes a
// UTF-16LE byte-order mark.
// Throws Exception(foxit::e_ErrFile) when the file cannot be opened.
// No exception specification is given: the in-class declaration has none, and
// dynamic exception specifications are no longer valid as of C++17.
TextDoc::TextDoc(const String& file_name, const String& file_mode)
    : file_(NULL) {
#if defined(_WIN32) || defined(_WIN64)
  fopen_s(&file_, (const char*)file_name, (const char*)file_mode);
#else
  file_ = fopen((const char*)file_name, (const char*)file_mode);
#endif
  if (!file_)
    throw Exception(__FILE__, __LINE__, __FUNCTION__, foxit::e_ErrFile);
  // Byte-order mark 0xFF 0xFE, matching the UTF-16LE text emitted by Write().
  const uint8 bom[] = {0xFF, 0xFE};
  fwrite(bom, sizeof(uint8), 2, file_);
  fseek(file_, 0, SEEK_END);
}
// Wide-string variant: converts both arguments with String::FromUnicode,
// opens the file with the fopen-style mode string and writes a UTF-16LE
// byte-order mark.
// Throws Exception(foxit::e_ErrFile) when the file cannot be opened.
// No exception specification is given: the in-class declaration has none, and
// dynamic exception specifications are no longer valid as of C++17.
TextDoc::TextDoc(const WString& file_name, const WString& file_mode)
    : file_(NULL) {
  String s_file_name = String::FromUnicode(file_name);
  String s_file_mode = String::FromUnicode(file_mode);
#if defined(_WIN32) || defined(_WIN64)
  fopen_s(&file_, (const char*)s_file_name, (const char*)s_file_mode);
#else
  file_ = fopen((const char*)s_file_name, (const char*)s_file_mode);
#endif
  if (!file_)
    throw Exception(__FILE__, __LINE__, __FUNCTION__, foxit::e_ErrFile);
  // Byte-order mark 0xFF 0xFE, matching the UTF-16LE text emitted by Write().
  const uint8 bom[] = {0xFF, 0xFE};
  fwrite(bom, sizeof(uint8), 2, file_);
  fseek(file_, 0, SEEK_END);
}
// Closes the underlying file, if one is open.
TextDoc::~TextDoc() {
  // fclose(NULL) is undefined behaviour, so only close a real handle.
  if (file_) {
    fclose(file_);
    file_ = NULL;
  }
}
void TextDoc::Write(const char* text_content) {
WString wide_str = WString::FromLocal(text_content);
Write(wide_str);
}
void TextDoc::Write(const wchar_t* text_content) {
WString wide_str(text_content);
if (wide_str.IsEmpty())
return;
String utf16le_str = wide_str.UTF16LE_Encode(false);
if (utf16le_str.IsEmpty())
return;
int length = utf16le_str.GetLength();
fwrite((const char*)utf16le_str, sizeof(char), length, file_);
}
// Writes prefix `count` times (not at all when count <= 0), then text_content.
void TextDoc::Write(int count, const wchar_t* prefix, const wchar_t* text_content) {
  int remaining = count;
  while (remaining > 0) {
    Write(prefix);
    --remaining;
  }
  Write(text_content);
}
// Alias for an array of LRElement; not referenced by the code visible in this listing.
typedef CFX_ArrayTemplate<LRElement> LRElementArray;
// Formats a boolean as the text "True" or "False".
WString LR_Format(bool val) {
  if (val)
    return L"True";
  return L"False";
}
// Formats a 32-bit integer as decimal text using the "%d" conversion.
WString LR_Format(int32 val) {
  WString formatted;
  formatted.Format(L"%d", val);
  return formatted;
}
// Formats a float with exactly one digit after the decimal point ("%.1f").
WString LR_Format(float val) {
  WString formatted;
  formatted.Format(L"%.1f", val);
  return formatted;
}
// Pass-through overload: returns val unchanged, which lets string attribute
// values go through the same LR_Format call as the other value types.
WString LR_Format(WString val) {
return val;
}
// Formats the low 24 bits of val as '#' followed by six uppercase hex digits
// (bits 23-16, then 15-8, then 7-0). Bits above 23 are not printed.
WString LR_Format(ARGB val) {
  const uint8 high_byte = (uint8)(val >> 16);
  const uint8 mid_byte = (uint8)(val >> 8);
  const uint8 low_byte = (uint8)(val);
  WString formatted;
  formatted.Format(L"#%02X%02X%02X", high_byte, mid_byte, low_byte);
  return formatted;
}
// Converts an attribute enum value to the text returned by
// LRStructureElement::StringifyEnumVariant, widened with WString::FromLocal.
WString LR_Format(LRStructureElement::AttributeValueEnum val) {
  String variant_name = LRStructureElement::StringifyEnumVariant(val);
  return WString::FromLocal(variant_name);
}
//Output the structure element.
// Writes a "< StructureElement: TYPE >" header line followed by one
// "Key: Value" line for every supported attribute whose value type is not
// empty. Array-typed values are wrapped in [ ] and separated by ", ".
// Every line is prefixed with `depth` tab characters.
// Writes nothing when depth exceeds 32 or when element is empty.
void OutputLRStructureElement(LRStructureElement element, TextDoc& doc, int depth) {
  if (depth > 32 || element.IsEmpty())
    return;

  LRStructureElement::ElementType elemType = element.GetElementType();
  String elementTypeStr = LRElement::StringifyElementType(elemType);
  WString outputStr = L"< StructureElement: " + WString::FromLocal(elementTypeStr) + L" >\r\n";
  doc.Write(depth, L"\t", outputStr);

  //Get supported attribute count.
  const int32 nSize = element.GetSupportedAttributeCount();
  for (int i = 0; i < nSize; i++) {
    //Get a supported attribute type by index.
    LRStructureElement::AttributeType attrType = element.GetSupportedAttribute(i);
    //Get attribute value type for a specified attribute type.
    LRStructureElement::AttributeValueType attrValueType = element.GetAttributeValueType(attrType);
    // Attributes without a value are skipped before any value is queried.
    if (attrValueType == LRStructureElement::e_AttributeValueTypeEmpty)
      continue;

    const bool bIsArray = LRStructureElement::IsArrayAttributeValueType(attrValueType);
    const int nLength = element.GetAttributeValueCount(attrType);

    //Convert the enum value of a specified attribute type to a short and memorable string (text) representation.
    WString szKey = WString::FromLocal(LRStructureElement::StringifyAttributeType(attrType));
    WString szVal = L"";
    if (bIsArray)
      szVal += L"[";
    for (int idx = 0; idx < nLength; idx++) {
      if (idx)
        szVal += L", ";
      // Scalar and array variants of a type share the same indexed getter.
      switch (attrValueType) {
        case LRStructureElement::e_AttributeValueTypeEnum:
        case LRStructureElement::e_AttributeValueTypeEnumArray:
          szVal += LR_Format(element.GetAttributeValueEnum(attrType, idx));
          break;
        case LRStructureElement::e_AttributeValueTypeInt32:
        case LRStructureElement::e_AttributeValueTypeInt32Array:
          szVal += LR_Format(element.GetAttributeValueInt32(attrType, idx));
          break;
        case LRStructureElement::e_AttributeValueTypeFloat:
        case LRStructureElement::e_AttributeValueTypeFloatArray:
          szVal += LR_Format(element.GetAttributeValueFloat(attrType, idx));
          break;
        case LRStructureElement::e_AttributeValueTypeARGB:
        case LRStructureElement::e_AttributeValueTypeARGBArray:
          szVal += LR_Format(element.GetAttributeValueARGB(attrType, idx));
          break;
        case LRStructureElement::e_AttributeValueTypeWStr:
        case LRStructureElement::e_AttributeValueTypeWStrArray:
          szVal += LR_Format(element.GetAttributeValueString(attrType, idx));
          break;
        default:
          break;
      }
    }
    if (bIsArray)
      szVal += L"]";
    outputStr = szKey + L": " + szVal + L"\r\n";
    doc.Write(depth, L"\t", outputStr);
  }
}
//Output the graphics object element.
// Writes a "< LRGraphicsObjectElement: >" header, then the element's BBox,
// Matrix and PageObjectIndex lines, and a "Text: ..." line when the related
// graphics object is a text object with non-empty text.
// Every line is prefixed with `depth` tab characters.
// Writes nothing when element is empty.
void OutputLRGraphicsObjectElement(LRGraphicsObjectElement element, TextDoc& doc, int depth) {
  if (element.IsEmpty())
    return;

  WString outputStr = L"< LRGraphicsObjectElement: >\r\n";
  doc.Write(depth, L"\t", outputStr);

  //Get bounding box, matrix, index of related graphics object.
  RectF rcBox = element.GetBBox();
  int objIndex = element.GetGraphicsObjectIndex();
  Matrix matrix = element.GetMatrix();

  WString szVal = L"";
  szVal.Format(L"BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", rcBox.left, rcBox.top, rcBox.right, rcBox.bottom);
  doc.Write(depth, L"\t", szVal);
  szVal.Format(L"Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", matrix.a, matrix.b, matrix.c, matrix.d, matrix.e,
               matrix.f);
  doc.Write(depth, L"\t", szVal);
  szVal.Format(L"PageObjectIndex: %d\r\n", objIndex);
  doc.Write(depth, L"\t", szVal);

  // Fetch the graphics object once and dereference it only after null checks.
  GraphicsObject* pObject = element.GetGraphicsObject();
  WString text;
  if (pObject && pObject->GetType() == GraphicsObject::e_TypeText && pObject->GetTextObject())
    text = pObject->GetTextObject()->GetText();
  if (!text.IsEmpty()) {
    szVal = L"Text: ";
    szVal += text;
    szVal += L"\r\n";
    doc.Write(depth, L"\t", szVal);
  }
}
//Output the content element.
// Writes a "< LRContentElement: TYPE >" header, then BBox, Matrix, StartPos
// and Length lines. StartPos/Length describe the first segment of the
// element's graphics object range (both 0 when the range is empty).
// When the content element has a graphics object element, that element is
// written afterwards one level deeper.
// Every line is prefixed with `depth` tab characters.
// Writes nothing when element is empty.
void OutputLRContentElement(LRContentElement element, TextDoc& doc, int depth) {
  if (element.IsEmpty())
    return;

  LRGraphicsObjectElement pageObj = element.GetGraphicsObjectElement();
  // Dedicated flag for the graphics object element, so the final check tests
  // pageObj itself and not the emptiness of some other element.
  const bool hasGraphicsObject = !pageObj.IsEmpty();

  LRElement::ElementType elemType = element.GetElementType();
  String elementTypeStr = LRStructureElement::StringifyElementType(elemType);
  WString outputStr = L"< LRContentElement: " + WString::FromLocal(elementTypeStr) + L" >\r\n";
  doc.Write(depth, L"\t", outputStr);

  //Get bounding box.
  RectF rcBox = element.GetBBox();
  int32 nStartPos = 0, nLength = 0;
  common::Range range = element.GetGraphicsObjectRange();
  if (!range.IsEmpty()) {
    nStartPos = range.GetSegmentStart(0);
    // Segment bounds are treated as inclusive, hence the + 1.
    nLength = range.GetSegmentEnd(0) - nStartPos + 1;
  }
  //Get matrix.
  Matrix matrix = element.GetMatrix();

  WString szVal = L"";
  szVal.Format(L"BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", rcBox.left, rcBox.top, rcBox.right, rcBox.bottom);
  doc.Write(depth, L"\t", szVal);
  szVal.Format(L"Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", matrix.a, matrix.b, matrix.c, matrix.d, matrix.e,
               matrix.f);
  doc.Write(depth, L"\t", szVal);
  szVal.Format(L"StartPos: %d\r\n", nStartPos);
  doc.Write(depth, L"\t", szVal);
  szVal.Format(L"Length: %d\r\n", nLength);
  doc.Write(depth, L"\t", szVal);

  //If content element has the graphics object element, output the graphics object element.
  if (hasGraphicsObject)
    OutputLRGraphicsObjectElement(pageObj, doc, depth + 1);
}
//Show the element information of the root element.
// Writes `element` itself, then visits its children in order: child structure
// elements are handled recursively at depth + 1, while content and graphics
// object children are written at the current depth.
// Does nothing when element is empty.
void ShowLRElementInfo(LRStructureElement element, TextDoc& doc, int depth) {
  // An empty element has nothing to print and must not be asked for children.
  if (element.IsEmpty())
    return;

  OutputLRStructureElement(element, doc, depth);

  const int nElemListSize = element.GetChildCount();
  //Get child of the structure element.
  for (int i = 0; i < nElemListSize; i++) {
    LRElement item = element.GetChild(i);
    if (item.IsStructureElement()) {
      //Continue to get child of the structure element.
      LRStructureElement srt_item(item);
      ShowLRElementInfo(srt_item, doc, depth + 1);
    } else if (item.IsContentElement()) {
      //Output the content element.
      LRContentElement srt_item(item);
      OutputLRContentElement(srt_item, doc, depth);
    } else if (item.IsGraphicsObjectElement()) {
      //Output the graphics object element.
      LRGraphicsObjectElement srt_item(item);
      OutputLRGraphicsObjectElement(srt_item, doc, depth);
    }
  }
}
//Identify the page.
// Opens info_path for writing, parses the layout of `page` with an LRContext
// and writes the resulting element tree to the file. A single notice line is
// written instead when the parse yields an empty root element.
void IdentifyPage(PDFPage page, const WString& info_path) {
  TextDoc text_doc(info_path, L"w+b");

  LRContext context(page);
  context.StartParse();

  LRStructureElement root = context.GetRootElement();
  if (root.IsEmpty()) {
    text_doc.Write("No layout recognition information!\r\n");
    return;
  }
  ShowLRElementInfo(root, text_doc, 0);
}
.NET & .NET Core:
//The class used for writing text.
// Appends UTF-8 text to a file through a FileStream and can read the whole
// file back. Implements IDisposable so callers can close the file
// deterministically; every write is flushed immediately, so no data is
// pending if the object is never disposed.
class TextDoc : System.IDisposable
{
    private FileStream mfile_stream;

    // Opens (or creates, depending on fill_mode) the file at file_name.
    public TextDoc(string file_name, FileMode fill_mode)
    {
        mfile_stream = new FileStream(file_name, fill_mode);
    }

    // Closes the underlying stream. Safe to call more than once.
    public void Dispose()
    {
        if (mfile_stream != null)
        {
            mfile_stream.Dispose();
            mfile_stream = null;
        }
    }

    // Writes prefix `count` times followed by contents, at the end of the file.
    public void Write(int count, string prefix, string contents)
    {
        System.Text.StringBuilder line = new System.Text.StringBuilder();
        for (int i = 0; i < count; i++)
        {
            line.Append(prefix);
        }
        line.Append(contents);
        Write(line.ToString());
    }

    // Writes contents at the end of the file, encoded as UTF-8 (the same
    // encoding Read() decodes), and flushes the stream.
    public void Write(string contents)
    {
        byte[] byte_data = System.Text.Encoding.UTF8.GetBytes(contents);
        // Move file pointer to the end of the file so the text is appended.
        mfile_stream.Seek(0, SeekOrigin.End);
        mfile_stream.Write(byte_data, 0, byte_data.Length);
        mfile_stream.Flush();
    }

    // Returns the entire file content decoded as UTF-8.
    public string Read()
    {
        // Start from the beginning; after a Write the position is at the end.
        mfile_stream.Seek(0, SeekOrigin.Begin);
        byte[] byte_data = new byte[(int)mfile_stream.Length];
        int total = 0;
        // Stream.Read may return fewer bytes than requested, so loop.
        while (total < byte_data.Length)
        {
            int read = mfile_stream.Read(byte_data, total, byte_data.Length - total);
            if (read <= 0)
                break;
            total += read;
        }
        return System.Text.Encoding.UTF8.GetString(byte_data, 0, total);
    }
}
// Walks the layout recognition (LR) element tree of a PDF page and writes a
// tab-indented text dump of every element through a TextDoc.
class layout_recognition
{
    // Formats a boolean as "True" or "False".
    public static String LR_Format(bool val)
    {
        return val ? "True" : "False";
    }

    // Formats an integer with the default "{0}" format.
    public static String LR_Format(int val)
    {
        return String.Format("{0}", val);
    }

    // Formats a float with exactly one digit after the decimal separator.
    public static String LR_Format(float val)
    {
        return String.Format("{0:0.0}", val);
    }

    // Formats the low 24 bits of val as '#' followed by six uppercase hex digits.
    // Each channel is masked to 8 bits: a cast to char keeps 16 bits in C#,
    // which would leak the neighbouring byte into the output.
    public static String LR_FormatARGB(uint val)
    {
        return String.Format("#{0:X2}{1:X2}{2:X2}", (val >> 16) & 0xFFu, (val >> 8) & 0xFFu, val & 0xFFu);
    }

    // Converts an attribute enum value to its short text form.
    public static String LR_FormatAttributeValueEnum(LRStructureElement.AttributeValueEnum val)
    {
        return LRStructureElement.StringifyEnumVariant(val);
    }

    //Output the structure element.
    // Writes a "< StructureElement: TYPE >" header and one "Key: Value" line per
    // supported attribute whose value type is not empty. Array-typed values are
    // wrapped in [ ] and separated by ", ". Writes nothing for an empty element.
    private static void OutputLRStructureElement(LRStructureElement element, TextDoc doc, int depth)
    {
        if (element.IsEmpty())
            return;

        LRElement.ElementType elemType = element.GetElementType();
        String elementTypeStr = LRStructureElement.StringifyElementType(elemType);
        String outputStr = "< StructureElement: " + elementTypeStr + " >\r\n";
        doc.Write(depth, "\t", outputStr);

        int nSize = element.GetSupportedAttributeCount();
        for (int i = 0; i < nSize; i++)
        {
            LRStructureElement.AttributeType attrType = element.GetSupportedAttribute(i);
            LRStructureElement.AttributeValueType attrValueType = element.GetAttributeValueType(attrType);
            // Attributes without a value are skipped before any value is queried.
            if (attrValueType == LRStructureElement.AttributeValueType.e_AttributeValueTypeEmpty)
                continue;

            bool bIsArray = LRStructureElement.IsArrayAttributeValueType(attrValueType);
            int nLength = element.GetAttributeValueCount(attrType);
            String szKey = LRStructureElement.StringifyAttributeType(attrType);

            System.Text.StringBuilder szVal = new System.Text.StringBuilder();
            if (bIsArray) szVal.Append("[");
            for (int idx = 0; idx < nLength; idx++)
            {
                if (idx > 0) szVal.Append(", ");
                // Scalar and array variants of a type share the same indexed getter.
                switch (attrValueType)
                {
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeEnum:
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeEnumArray:
                        szVal.Append(LR_FormatAttributeValueEnum(element.GetAttributeValueEnum(attrType, idx)));
                        break;
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeInt32:
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeInt32Array:
                        szVal.Append(LR_Format(element.GetAttributeValueInt32(attrType, idx)));
                        break;
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeFloat:
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeFloatArray:
                        szVal.Append(LR_Format(element.GetAttributeValueFloat(attrType, idx)));
                        break;
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeARGB:
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeARGBArray:
                        szVal.Append(LR_FormatARGB(element.GetAttributeValueARGB(attrType, idx)));
                        break;
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeWStr:
                    case LRStructureElement.AttributeValueType.e_AttributeValueTypeWStrArray:
                        szVal.Append(element.GetAttributeValueString(attrType, idx));
                        break;
                }
            }
            if (bIsArray) szVal.Append("]");
            outputStr = szKey + ": " + szVal.ToString() + "\r\n";
            doc.Write(depth, "\t", outputStr);
        }
    }

    //Output the graphics object element.
    // Writes a "< LRGraphicsObjectElement: >" header, then BBox, Matrix and
    // PageObjectIndex lines, and a "Text: ..." line when the related graphics
    // object is a text object with non-empty text. Writes nothing for an empty element.
    private static void OutputLRGraphicsObjectElement(LRGraphicsObjectElement element, TextDoc doc, int depth)
    {
        if (element.IsEmpty())
            return;

        String outputStr = "< LRGraphicsObjectElement: >\r\n";
        doc.Write(depth, "\t", outputStr);

        GraphicsObject pageObj = element.GetGraphicsObject();
        RectF rcBox = element.GetBBox();
        int objIndex = element.GetGraphicsObjectIndex();
        Matrix2D matrix = element.GetMatrix();

        String szVal = String.Format("BBox: [{0:0.0},{1:0.0},{2:0.0},{3:0.0}]\r\n", rcBox.left, rcBox.top, rcBox.right, rcBox.bottom);
        doc.Write(depth, "\t", szVal);
        szVal = String.Format("Matrix: [{0:0.0},{1:0.0},{2:0.0},{3:0.0},{4:0.0},{5:0.0}]\r\n", matrix.a, matrix.b, matrix.c, matrix.d, matrix.e, matrix.f);
        doc.Write(depth, "\t", szVal);
        szVal = String.Format("PageObjectIndex: {0}\r\n", objIndex);
        doc.Write(depth, "\t", szVal);

        // Only text objects carry text; a missing graphics object yields no text line.
        String text = "";
        if (pageObj != null && pageObj.GetType() == GraphicsObject.Type.e_TypeText)
            text = pageObj.GetTextObject().GetText();
        if (!String.IsNullOrEmpty(text))
        {
            szVal = "Text: ";
            szVal += text;
            szVal += "\r\n";
            doc.Write(depth, "\t", szVal);
        }
    }

    //Output the content element.
    // Writes a "< LRContentElement: TYPE >" header, then BBox, Matrix, StartPos
    // and Length lines (StartPos/Length describe the first segment of the
    // graphics object range, or 0/0 when the range is empty). When the content
    // element has a graphics object element, it is written one level deeper.
    private static void OutputLRContentElement(LRContentElement element, TextDoc doc, int depth)
    {
        if (element.IsEmpty())
            return;

        LRGraphicsObjectElement pageObj = element.GetGraphicsObjectElement();
        // Dedicated flag for the graphics object element, so the final check
        // tests pageObj itself and not the emptiness of some other element.
        bool hasGraphicsObject = !pageObj.IsEmpty();

        LRElement.ElementType elemType = element.GetElementType();
        String elementTypeStr = LRElement.StringifyElementType(elemType);
        String outputStr = "< LRContentElement: " + elementTypeStr + " >\r\n";
        doc.Write(depth, "\t", outputStr);

        RectF rcBox = element.GetBBox();
        int nStartPos = 0, nLength = 0;
        Range range = element.GetGraphicsObjectRange();
        if (!range.IsEmpty())
        {
            nStartPos = range.GetSegmentStart(0);
            // Segment bounds are treated as inclusive, hence the + 1.
            nLength = range.GetSegmentEnd(0) - nStartPos + 1;
        }
        Matrix2D matrix = element.GetMatrix();

        String szVal = String.Format("BBox: [{0:0.0},{1:0.0},{2:0.0},{3:0.0}]\r\n", rcBox.left, rcBox.top, rcBox.right, rcBox.bottom);
        doc.Write(depth, "\t", szVal);
        szVal = String.Format("Matrix: [{0:0.0},{1:0.0},{2:0.0},{3:0.0},{4:0.0},{5:0.0}]\r\n", matrix.a, matrix.b, matrix.c, matrix.d, matrix.e, matrix.f);
        doc.Write(depth, "\t", szVal);
        szVal = String.Format("StartPos: {0}\r\n", nStartPos);
        doc.Write(depth, "\t", szVal);
        szVal = String.Format("Length: {0}\r\n", nLength);
        doc.Write(depth, "\t", szVal);

        //If content element has the graphics object element, output the graphics object element.
        if (hasGraphicsObject)
            OutputLRGraphicsObjectElement(pageObj, doc, depth + 1);
    }

    //Show the element information of the root element.
    // Writes `element`, then visits its children in order: child structure
    // elements recursively at depth + 1, content and graphics object children
    // at the current depth.
    private static void ShowLRElementInfo(LRStructureElement element, TextDoc doc, int depth)
    {
        int nElemListSize = element.GetChildCount();
        OutputLRStructureElement(element, doc, depth);
        for (int i = 0; i < nElemListSize; i++)
        {
            //Get child of the structure element.
            LRElement item = element.GetChild(i);
            if (item.IsStructureElement())
            {
                //Continue to get child of the structure element.
                LRStructureElement srt_item = new LRStructureElement(item);
                ShowLRElementInfo(srt_item, doc, depth + 1);
            }
            else if (item.IsContentElement())
            {
                //Output the content element.
                LRContentElement srt_item = new LRContentElement(item);
                OutputLRContentElement(srt_item, doc, depth);
            }
            else if (item.IsGraphicsObjectElement())
            {
                //Output the graphics object element.
                LRGraphicsObjectElement srt_item = new LRGraphicsObjectElement(item);
                OutputLRGraphicsObjectElement(srt_item, doc, depth);
            }
        }
    }

    //Identify the page.
    // Creates the output file at info_path, parses the layout of `page` and
    // writes the element tree, or a single notice line when the root is empty.
    private static void IdentifyPage(PDFPage page, String info_path)
    {
        TextDoc text_doc = new TextDoc(info_path, FileMode.Create);
        using (LRContext context = new LRContext(page))
        {
            // Passing null means no pause callback is supplied.
            context.StartParse(null);
            LRStructureElement root = context.GetRootElement();
            if (!root.IsEmpty())
            {
                ShowLRElementInfo(root, text_doc, 0);
            }
            else
            {
                text_doc.Write("No layout recognition information!\r\n");
            }
        }
    }
}
Java:
// Writes prefix `depth` times (not at all when depth <= 0), then content.
public static void writeTextWithPrefix(OutputStreamWriter doc, int depth, String prefix, String content) throws IOException {
    int remaining = depth;
    while (remaining > 0) {
        doc.write(prefix);
        remaining--;
    }
    doc.write(content);
}
// Formats a boolean as the text "True" or "False".
private static String lr_Format(boolean val) {
    if (val) {
        return "True";
    }
    return "False";
}
// Formats an int as decimal text using the "%d" conversion of String.format.
private static String lr_Format(int val) {
return String.format("%d", val);
}
// Formats a float with exactly one digit after the decimal separator ("%.1f").
private static String lr_Format(float val) {
return String.format("%.1f", val);
}
// Formats the low 24 bits of val as '#' followed by six uppercase hex digits
// (bits 23-16, then 15-8, then 7-0). Bits above 23 are not printed.
private static String lr_Format(long val) {
    byte highByte = (byte) (val >> 16);
    byte midByte = (byte) (val >> 8);
    byte lowByte = (byte) (val);
    return String.format("#%02X%02X%02X", highByte, midByte, lowByte);
}
// Converts an attribute enum value to the text returned by
// LRStructureElement.stringifyEnumVariant.
private static String lr_FormatAttributeValueEnum(int val) {
return LRStructureElement.stringifyEnumVariant(val);
}
//Output the structure element.
// Writes a "< StructureElement: TYPE >" header and one "Key: Value" line per
// supported attribute whose value type is not empty. Array-typed values are
// wrapped in [ ] and separated by ", ". Every line is prefixed with `depth`
// tab characters. Writes nothing for an empty element.
private static void OutputLRStructureElement(LRStructureElement element, OutputStreamWriter doc, int depth) throws PDFException, IOException {
    if (element.isEmpty())
        return;

    int elemType = element.getElementType();
    String elementTypeStr = LRStructureElement.stringifyElementType(elemType);
    String outputStr = "< StructureElement: " + elementTypeStr + " >\r\n";
    writeTextWithPrefix(doc, depth, "\t", outputStr);

    int nSize = element.getSupportedAttributeCount();
    for (int i = 0; i < nSize; i++) {
        int attrType = element.getSupportedAttribute(i);
        int attrValueType = element.getAttributeValueType(attrType);
        // Attributes without a value are skipped before any value is queried.
        if (attrValueType == LRStructureElement.e_AttributeValueTypeEmpty)
            continue;

        boolean bIsArray = LRStructureElement.isArrayAttributeValueType(attrValueType);
        int nLength = element.getAttributeValueCount(attrType);
        String szKey = LRStructureElement.stringifyAttributeType(attrType);

        StringBuilder szVal = new StringBuilder();
        if (bIsArray) szVal.append("[");
        for (int idx = 0; idx < nLength; idx++) {
            if (idx > 0) szVal.append(", ");
            // Scalar and array variants of a type share the same indexed getter.
            switch (attrValueType) {
                case LRStructureElement.e_AttributeValueTypeEnum:
                case LRStructureElement.e_AttributeValueTypeEnumArray:
                    szVal.append(lr_FormatAttributeValueEnum(element.getAttributeValueEnum(attrType, idx)));
                    break;
                case LRStructureElement.e_AttributeValueTypeInt32:
                case LRStructureElement.e_AttributeValueTypeInt32Array:
                    szVal.append(lr_Format(element.getAttributeValueInt32(attrType, idx)));
                    break;
                case LRStructureElement.e_AttributeValueTypeFloat:
                case LRStructureElement.e_AttributeValueTypeFloatArray:
                    szVal.append(lr_Format(element.getAttributeValueFloat(attrType, idx)));
                    break;
                case LRStructureElement.e_AttributeValueTypeARGB:
                case LRStructureElement.e_AttributeValueTypeARGBArray:
                    szVal.append(lr_Format(element.getAttributeValueARGB(attrType, idx)));
                    break;
                case LRStructureElement.e_AttributeValueTypeWStr:
                case LRStructureElement.e_AttributeValueTypeWStrArray:
                    szVal.append(element.getAttributeValueString(attrType, idx));
                    break;
                default:
                    break;
            }
        }
        if (bIsArray) szVal.append("]");
        outputStr = szKey + ": " + szVal + "\r\n";
        writeTextWithPrefix(doc, depth, "\t", outputStr);
    }
}
//Output the graphics object element.
// Writes a "< LRGraphicsObjectElement: >" header, then BBox, Matrix and
// PageObjectIndex lines, and a "Text: ..." line when the related graphics
// object is a text object with non-empty text. Every line is prefixed with
// `depth` tab characters. Writes nothing for an empty element.
private static void OutputLRGraphicsObjectElement(LRGraphicsObjectElement element, OutputStreamWriter doc, int depth) throws PDFException, IOException {
    if (element.isEmpty())
        return;

    String outputStr = "< LRGraphicsObjectElement: >\r\n";
    writeTextWithPrefix(doc, depth, "\t", outputStr);

    GraphicsObject pageObj = element.getGraphicsObject();
    RectF rcBox = element.getBBox();
    int objIndex = element.getGraphicsObjectIndex();
    Matrix2D matrix = element.getMatrix();

    String szVal = String.format("BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", rcBox.getLeft(), rcBox.getTop(), rcBox.getRight(), rcBox.getBottom());
    writeTextWithPrefix(doc, depth, "\t", szVal);
    szVal = String.format("Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", matrix.getA(), matrix.getB(), matrix.getC(), matrix.getD(), matrix.getE(), matrix.getF());
    writeTextWithPrefix(doc, depth, "\t", szVal);
    szVal = String.format("PageObjectIndex: %d\r\n", objIndex);
    writeTextWithPrefix(doc, depth, "\t", szVal);

    // Only text objects carry text; anything missing along the way yields no text line.
    String text = "";
    if (pageObj != null && pageObj.getType() == (int) GraphicsObject.e_TypeText && pageObj.getTextObject() != null) {
        String value = pageObj.getTextObject().getText();
        if (value != null)
            text = value;
    }
    if (text.length() > 0) {
        szVal = "Text: ";
        szVal += text;
        szVal += "\r\n";
        writeTextWithPrefix(doc, depth, "\t", szVal);
    }
}
//Output the content element.
// Writes a "< LRContentElement: TYPE >" header, then BBox, Matrix, StartPos
// and Length lines (StartPos/Length describe the first segment of the
// graphics object range, or 0/0 when the range is empty). When the content
// element has a graphics object element, it is written one level deeper.
// Writes nothing for an empty element.
private static void OutputLRContentElement(LRContentElement element, OutputStreamWriter doc, int depth) throws PDFException, IOException {
    if (element.isEmpty())
        return;

    LRGraphicsObjectElement pageObj = element.getGraphicsObjectElement();
    // Dedicated flag for the graphics object element, so the final check tests
    // pageObj itself and not the emptiness of some other element.
    boolean hasGraphicsObject = !pageObj.isEmpty();

    int elemType = element.getElementType();
    String elementTypeStr = LRElement.stringifyElementType(elemType);
    String outputStr = "< LRContentElement: " + elementTypeStr + " >\r\n";
    writeTextWithPrefix(doc, depth, "\t", outputStr);

    RectF rcBox = element.getBBox();
    int nStartPos = 0, nLength = 0;
    Range range = element.getGraphicsObjectRange();
    if (!range.isEmpty()) {
        nStartPos = range.getSegmentStart(0);
        // Segment bounds are treated as inclusive, hence the + 1.
        nLength = range.getSegmentEnd(0) - nStartPos + 1;
    }
    Matrix2D matrix = element.getMatrix();

    String szVal = String.format("BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", rcBox.getLeft(), rcBox.getTop(), rcBox.getRight(), rcBox.getBottom());
    writeTextWithPrefix(doc, depth, "\t", szVal);
    szVal = String.format("Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", matrix.getA(), matrix.getB(), matrix.getC(), matrix.getD(), matrix.getE(), matrix.getF());
    writeTextWithPrefix(doc, depth, "\t", szVal);
    szVal = String.format("StartPos: %d\r\n", nStartPos);
    writeTextWithPrefix(doc, depth, "\t", szVal);
    szVal = String.format("Length: %d\r\n", nLength);
    writeTextWithPrefix(doc, depth, "\t", szVal);

    // If content element has the graphics object element, output the graphics object element.
    if (hasGraphicsObject)
        OutputLRGraphicsObjectElement(pageObj, doc, depth + 1);
}
//Show the element information of the root element.
// Writes `element`, then visits its children in order: child structure
// elements recursively at depth + 1, content and graphics object children at
// the current depth. Does nothing when element is empty.
private static void showLRElementInfo(LRStructureElement element, OutputStreamWriter doc, int depth) throws PDFException, IOException {
    // An empty element has nothing to print and must not be asked for children.
    if (element.isEmpty())
        return;

    int nElemListSize = element.getChildCount();
    OutputLRStructureElement(element, doc, depth);
    for (int i = 0; i < nElemListSize; i++) {
        //Get child of the structure element.
        LRElement item = element.getChild(i);
        if (item.isStructureElement()) {
            //Continue to get child of the structure element.
            LRStructureElement srt_item = new LRStructureElement(item);
            showLRElementInfo(srt_item, doc, depth + 1);
        } else if (item.isContentElement()) {
            //Output the content element.
            LRContentElement srt_item = new LRContentElement(item);
            OutputLRContentElement(srt_item, doc, depth);
        } else if (item.isGraphicsObjectElement()) {
            //Output the graphics object element.
            LRGraphicsObjectElement srt_item = new LRGraphicsObjectElement(item);
            OutputLRGraphicsObjectElement(srt_item, doc, depth);
        }
    }
}
//Identify the page.
// Creates the output file at info_path, parses the layout of `page` and writes
// the element tree as UTF-8 text, or a single notice line when the root
// element is empty. The file handle is closed even when parsing or writing
// throws.
private static void IdentityPage(PDFPage page, String info_path) throws PDFException, IOException {
    FileOutputStream fos = new FileOutputStream(info_path);
    try {
        OutputStreamWriter text_doc = new OutputStreamWriter(fos, "UTF-8");
        LRContext context = new LRContext(page);
        // Passing null means no pause callback is supplied.
        context.startParse(null);
        LRStructureElement root = context.getRootElement();
        if (!root.isEmpty()) {
            showLRElementInfo(root, text_doc, 0);
        } else {
            text_doc.write("No layout recognition information!\r\n");
        }
        // Push buffered characters into the stream before it is closed below.
        text_doc.flush();
    } finally {
        fos.close();
    }
}
Objective-C:
// Writes prefix `depth` times (not at all when depth <= 0), then content,
// each encoded as UTF-8, to the file handle.
void writeTextWithPrefix(NSFileHandle* doc, int depth, NSString* prefix, NSString* content) {
    int remaining = depth;
    while (remaining > 0) {
        [doc writeData:[prefix dataUsingEncoding:NSUTF8StringEncoding]];
        remaining--;
    }
    [doc writeData:[content dataUsingEncoding:NSUTF8StringEncoding]];
}
// Formats a BOOL as the text "True" or "False".
NSString* lr_Format(BOOL val) {
    if (val) {
        return @"True";
    }
    return @"False";
}
// Formats an int as decimal text using the "%d" conversion.
NSString* lr_Format(int val) {
return [NSString stringWithFormat:@"%d", val];
}
// Formats a float with exactly one digit after the decimal point ("%.1f").
NSString* lr_Format(float val) {
return [NSString stringWithFormat:@"%.1f", val];
}
// Formats the low 24 bits of val as '#' followed by six uppercase hex digits
// (bits 23-16, then 15-8, then 7-0). Bits above 23 are not printed.
NSString* lr_FormatARGB(long val) {
    // Use unsigned char: a plain (possibly signed) char holding 0x80..0xFF is
    // sign-extended when promoted for the variadic call and prints as FFFFFFxx.
    unsigned char highByte = (unsigned char)(val >> 16);
    unsigned char midByte = (unsigned char)(val >> 8);
    unsigned char lowByte = (unsigned char)(val);
    return [NSString stringWithFormat:@"#%02X%02X%02X", highByte, midByte, lowByte];
}
// Converts an attribute enum value to the text returned by
// +[FSLRStructureElement stringifyEnumVariant:].
NSString* lr_FormatAttributeValueEnum(FSLRStructureElementAttributeValueEnum val) {
return [FSLRStructureElement stringifyEnumVariant:val];
}
//Output the structure element.
// Writes a "< StructureElement: TYPE >" header and one "Key: Value" line per
// supported attribute whose value type is not empty. Array-typed values are
// wrapped in [ ] and separated by ", ". Every line is prefixed with `depth`
// tab characters. Writes nothing for an empty element.
void OutputLRStructureElement(FSLRStructureElement* element, NSFileHandle* doc, int depth) {
    if ([element isEmpty])
        return;

    FSLRElementElementType elemType = [element getElementType];
    NSString* elementTypeStr = [FSLRStructureElement stringifyElementType:elemType];
    NSString* outputStr = [NSString stringWithFormat:@"< StructureElement: %@ >\r\n", elementTypeStr];
    writeTextWithPrefix(doc, depth, @"\t", outputStr);

    int nSize = [element getSupportedAttributeCount];
    for (int i = 0; i < nSize; i++) {
        FSLRStructureElementAttributeType attrType = [element getSupportedAttribute:i];
        FSLRStructureElementAttributeValueType attrValueType = [element getAttributeValueType:attrType];
        // Attributes without a value are skipped before any value is queried.
        if (attrValueType == FSLRStructureElementAttributeValueTypeEmpty)
            continue;

        BOOL bIsArray = [FSLRStructureElement isArrayAttributeValueType:attrValueType];
        int nLength = [element getAttributeValueCount:attrType];
        NSString* szKey = [FSLRStructureElement stringifyAttributeType:attrType];

        NSMutableString* szVal = [NSMutableString string];
        if (bIsArray)
            [szVal appendString:@"["];
        for (int idx = 0; idx < nLength; idx++) {
            if (idx > 0)
                [szVal appendString:@", "];
            // Scalar and array variants of a type share the same indexed getter.
            NSString* piece = nil;
            switch (attrValueType) {
                case FSLRStructureElementAttributeValueTypeEnum:
                case FSLRStructureElementAttributeValueTypeEnumArray:
                    piece = lr_FormatAttributeValueEnum([element getAttributeValueEnum:attrType index:idx]);
                    break;
                case FSLRStructureElementAttributeValueTypeInt32:
                case FSLRStructureElementAttributeValueTypeInt32Array:
                    piece = lr_Format([element getAttributeValueInt32:attrType index:idx]);
                    break;
                case FSLRStructureElementAttributeValueTypeFloat:
                case FSLRStructureElementAttributeValueTypeFloatArray:
                    piece = lr_Format([element getAttributeValueFloat:attrType index:idx]);
                    break;
                case FSLRStructureElementAttributeValueTypeARGB:
                case FSLRStructureElementAttributeValueTypeARGBArray:
                    piece = lr_FormatARGB([element getAttributeValueARGB:attrType index:idx]);
                    break;
                case FSLRStructureElementAttributeValueTypeWStr:
                case FSLRStructureElementAttributeValueTypeWStrArray:
                    piece = [element getAttributeValueString:attrType index:idx];
                    break;
                default:
                    break;
            }
            // Appending nil raises NSInvalidArgumentException, so skip missing values.
            if (piece != nil)
                [szVal appendString:piece];
        }
        if (bIsArray)
            [szVal appendString:@"]"];
        outputStr = [NSString stringWithFormat:@"%@: %@\r\n", szKey, szVal];
        writeTextWithPrefix(doc, depth, @"\t", outputStr);
    }
}
//Output the graphics object element.
// Writes a "< LRGraphicsObjectElement: >" header, then BBox, Matrix and
// PageObjectIndex lines, and a "Text: ..." line when the related graphics
// object is a text object with non-empty text. Every line is prefixed with
// `depth` tab characters. Writes nothing for an empty element.
void OutputLRGraphicsObjectElement(FSLRGraphicsObjectElement* element, NSFileHandle* doc, int depth) {
    if ([element isEmpty])
        return;

    NSString* outputStr = @"< LRGraphicsObjectElement: >\r\n";
    writeTextWithPrefix(doc, depth, @"\t", outputStr);

    FSGraphicsObject* pageObj = [element getGraphicsObject];
    FSRectF* rcBox = [element getBBox];
    int objIndex = [element getGraphicsObjectIndex];
    FSMatrix2D* matrix = [element getMatrix];

    NSString* szVal = [NSString stringWithFormat:@"BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", [rcBox getLeft], [rcBox getTop], [rcBox getRight],
                                                 [rcBox getBottom]];
    writeTextWithPrefix(doc, depth, @"\t", szVal);
    szVal = [NSString stringWithFormat:@"Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", [matrix getA], [matrix getB], [matrix getC],
                                       [matrix getD], [matrix getE], [matrix getF]];
    writeTextWithPrefix(doc, depth, @"\t", szVal);
    szVal = [NSString stringWithFormat:@"PageObjectIndex: %d\r\n", objIndex];
    writeTextWithPrefix(doc, depth, @"\t", szVal);

    // Only text objects carry text; a nil graphics object yields no text line.
    NSString* text = nil;
    if (pageObj != nil && [pageObj getType] == FSGraphicsObjectTypeText)
        text = [[pageObj getTextObject] getText];
    if ([text length] > 0) {
        szVal = [NSString stringWithFormat:@"Text: %@\r\n", text];
        writeTextWithPrefix(doc, depth, @"\t", szVal);
    }
}
//Output the content element.
// Writes a "< LRContentElement: TYPE >" header, then BBox, Matrix, StartPos
// and Length lines (StartPos/Length describe the first segment of the
// graphics object range, or 0/0 when the range is empty). When the content
// element has a graphics object element, it is written one level deeper.
// Writes nothing for an empty element.
void OutputLRContentElement(FSLRContentElement* element, NSFileHandle* doc, int depth) {
    if ([element isEmpty])
        return;

    FSLRGraphicsObjectElement* pageObj = [element getGraphicsObjectElement];
    // Dedicated flag for the graphics object element, so the final check tests
    // pageObj itself and not the emptiness of some other element.
    BOOL hasGraphicsObject = ![pageObj isEmpty];

    FSLRElementElementType elemType = [element getElementType];
    NSString* elementTypeStr = [FSLRElement stringifyElementType:elemType];
    NSString* outputStr = [NSString stringWithFormat:@"< LRContentElement: %@ >\r\n", elementTypeStr];
    writeTextWithPrefix(doc, depth, @"\t", outputStr);

    FSRectF* rcBox = [element getBBox];
    int nStartPos = 0, nLength = 0;
    FSRange* range = [element getGraphicsObjectRange];
    if (![range isEmpty]) {
        nStartPos = [range getSegmentStart:0];
        // Segment bounds are treated as inclusive, hence the + 1.
        nLength = [range getSegmentEnd:0] - nStartPos + 1;
    }
    FSMatrix2D* matrix = [element getMatrix];

    NSString* szVal = [NSString stringWithFormat:@"BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", [rcBox getLeft], [rcBox getTop], [rcBox getRight],
                                                 [rcBox getBottom]];
    writeTextWithPrefix(doc, depth, @"\t", szVal);
    szVal = [NSString stringWithFormat:@"Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", [matrix getA], [matrix getB], [matrix getC],
                                       [matrix getD], [matrix getE], [matrix getF]];
    writeTextWithPrefix(doc, depth, @"\t", szVal);
    szVal = [NSString stringWithFormat:@"StartPos: %d\r\n", nStartPos];
    writeTextWithPrefix(doc, depth, @"\t", szVal);
    szVal = [NSString stringWithFormat:@"Length: %d\r\n", nLength];
    writeTextWithPrefix(doc, depth, @"\t", szVal);

    //If content element has the graphics object element, output the graphics object element.
    if (hasGraphicsObject)
        OutputLRGraphicsObjectElement(pageObj, doc, depth + 1);
}
// Show the element information of the root element.
// Outputs `element` itself, then visits each child in index order: structure
// children recurse with depth + 1, while content and graphics-object children
// are output at the current depth.
void showLRElementInfo(FSLRStructureElement* element, NSFileHandle* doc, int depth) {
    int childCount = [element getChildCount];
    OutputLRStructureElement(element, doc, depth);

    for (int index = 0; index < childCount; ++index) {
        FSLRElement* child = [element getChild:index];
        // The type is queried here but not used for dispatch; the is...Element
        // predicates below decide how the child is handled.
        FSLRElementElementType childType = [child getElementType];
        (void)childType;

        if ([child isStructureElement]) {
            // Structure children may have children of their own: recurse.
            FSLRStructureElement* structureChild = [[FSLRStructureElement alloc] initWithElement:child];
            showLRElementInfo(structureChild, doc, depth + 1);
            continue;
        }
        if ([child isContentElement]) {
            FSLRContentElement* contentChild = [[FSLRContentElement alloc] initWithElement:child];
            OutputLRContentElement(contentChild, doc, depth);
            continue;
        }
        if ([child isGraphicsObjectElement]) {
            FSLRGraphicsObjectElement* graphicsChild = [[FSLRGraphicsObjectElement alloc] initWithElement:child];
            OutputLRGraphicsObjectElement(graphicsChild, doc, depth);
        }
    }
}
// Identify the page.
// Creates (or truncates) the text file at `info_path`, runs layout recognition
// on `page`, and writes the recognized element tree to the file. When the root
// element is empty, a single "no information" line is written instead.
void IdentifyPage(FSPDFPage* page, NSString* info_path) {
    NSFileManager* file_manager = [NSFileManager defaultManager];
    if (![file_manager createFileAtPath:info_path contents:nil attributes:nil]) {
        NSLog(@"Fail to create txt file %@\r\n", info_path);
        return;
    }

    // fileHandleForWritingAtPath: returns nil when the file cannot be opened.
    // Without this check every later write would be a silent no-op on nil.
    NSFileHandle* text_doc = [NSFileHandle fileHandleForWritingAtPath:info_path];
    if (text_doc == nil) {
        NSLog(@"Fail to open txt file %@ for writing\r\n", info_path);
        return;
    }

    FSLRContext* context = [[FSLRContext alloc] initWithPage:page];
    // NOTE(review): the returned progressive object is not polled; this assumes
    // that passing nil as the pause callback lets parsing finish inside
    // startParse: -- confirm against the SDK documentation.
    FSProgressive* progressive = [context startParse:nil];
    (void)progressive;

    FSLRStructureElement* root = [context getRootElement];
    if (![root isEmpty]) {
        showLRElementInfo(root, text_doc, 0);
    } else {
        NSString* record_content = @"No layout recognition information!\r\n";
        NSData* content_data = [record_content dataUsingEncoding:NSUTF8StringEncoding];
        [text_doc writeData:content_data];
    }
    [text_doc closeFile];
}
C:
// Convert a wide string to a multibyte string with wcstombs.
//
// The function temporarily asks for the "chs" locale and restores the previous
// locale before returning. Conversion stops at the first L'\0' in `wstr`.
// Returns an empty string when wcstombs reports that a character cannot be
// converted in the active locale.
std::string wstring2string(const std::wstring wstr) {
    // setlocale(LC_ALL, NULL) only queries the current locale. It may return
    // NULL, and the buffer it returns can be overwritten by the next setlocale
    // call, so the name is copied before the locale is changed.
    const char* current = setlocale(LC_ALL, NULL);
    const bool have_saved = (current != NULL);
    const std::string saved_locale = have_saved ? current : "";
    setlocale(LC_ALL, "chs");

    // MB_CUR_MAX is the maximum number of bytes a single wide character needs
    // in the locale active right now, so this capacity cannot truncate the
    // result (a fixed 2 bytes per character is not enough for every encoding).
    const size_t capacity = wstr.size() * MB_CUR_MAX + 1;
    std::string buffer(capacity, '\0');

    std::string result;
    const size_t written = wcstombs(&buffer[0], wstr.c_str(), capacity);
    if (written != static_cast<size_t>(-1))
        result.assign(buffer.data(), written);

    if (have_saved)
        setlocale(LC_ALL, saved_locale.c_str());
    return result;
}
// The class used for writing text.
//
// A TextDoc owns one FILE*: the constructors open it and the destructor closes
// it. Because of that single ownership the class is not copyable -- a copy
// would make two objects close the same FILE*.
class TextDoc {
public:
    // Open `file_name` using the fopen-style mode string `file_mode`.
    TextDoc(const std::string& file_name, const std::string& file_mode);
    // Wide-string variant of the constructor above.
    TextDoc(const std::wstring& file_name, const std::wstring& file_mode);
    ~TextDoc();

    // Write narrow text (converted to wide text first).
    void Write(const char* text_content);
    // Write wide text.
    void Write(const wchar_t* text_content);
    // Write `prefix` `count` times, then `text_content`.
    void Write(int count, const wchar_t* prefix, const wchar_t* text_content);

private:
    // Declared but intentionally left undefined so that any attempt to copy a
    // TextDoc fails at compile time (or at link time from inside the class).
    TextDoc(const TextDoc&);
    TextDoc& operator=(const TextDoc&);

    FILE* file_;
};
// Open `file_name` with the fopen-style mode `file_mode`, write the two bytes
// 0xFF 0xFE (the UTF-16LE byte-order mark) and move to the end of the file.
//
// Throws std::exception when the file cannot be opened, so no write is ever
// attempted on a null FILE*. No dynamic exception specification is used: the
// in-class declaration has none, and throw(...) specifications are not valid
// in C++17.
TextDoc::TextDoc(const std::string& file_name, const std::string& file_mode)
    : file_(NULL) {
    // fopen_s is only available with the Microsoft CRT; other platforms use fopen.
#if defined(_WIN32) || defined(_WIN64)
    fopen_s(&file_, file_name.c_str(), file_mode.c_str());
#else
    file_ = fopen(file_name.c_str(), file_mode.c_str());
#endif
    if (!file_)
        throw std::exception();

    const FS_UINT8 bom[] = { 0xFF, 0xFE };
    fwrite(bom, sizeof(FS_UINT8), 2, file_);
    fseek(file_, 0, SEEK_END);
}
// Wide-string constructor: converts both arguments with wstring2string, opens
// the file, writes the two bytes 0xFF 0xFE (the UTF-16LE byte-order mark) and
// moves to the end of the file.
//
// Throws std::exception when the file cannot be opened, so no write is ever
// attempted on a null FILE*. No dynamic exception specification is used: the
// in-class declaration has none, and throw(...) specifications are not valid
// in C++17.
TextDoc::TextDoc(const std::wstring& file_name, const std::wstring& file_mode)
    : file_(NULL) {
    const std::string s_file_name = wstring2string(file_name);
    const std::string s_file_mode = wstring2string(file_mode);
    // fopen_s is only available with the Microsoft CRT; other platforms use fopen.
#if defined(_WIN32) || defined(_WIN64)
    fopen_s(&file_, s_file_name.c_str(), s_file_mode.c_str());
#else
    file_ = fopen(s_file_name.c_str(), s_file_mode.c_str());
#endif
    if (!file_)
        throw std::exception();

    const FS_UINT8 bom[] = { 0xFF, 0xFE };
    fwrite(bom, sizeof(FS_UINT8), 2, file_);
    fseek(file_, 0, SEEK_END);
}
// Close the file if one is open. Calling fclose with a null pointer is
// undefined behavior, so the handle is checked first.
TextDoc::~TextDoc() {
    if (file_ != NULL) {
        fclose(file_);
        file_ = NULL;
    }
}
void TextDoc::Write(const char* text_content) {
wstring wstr;
wstr = string2wstring(text_content);
Write(wstr.c_str());
}
void TextDoc::Write(const wchar_t* text_content) {
FS_WSTR wide_str;
int len = wcslen(text_content) * sizeof(wchar_t) + 1;
FSDK_WStr_Init(wide_str);
FSDK_WStr_SetLength(wide_str, len);
FSDK_WStr_Set(wide_str, text_content, len);
if (wide_str.str == NULL)return;
FS_BSTR utf16le_str;
FSDK_BStr_Init(utf16le_str);
FSDK_WStr_UTF16LEEncode(wide_str, utf16le_str);
FSDK_WStr_Clear(wide_str);
if (utf16le_str.str == NULL)return;
int length = utf16le_str.len;
fwrite((const char*)utf16le_str.str, sizeof(char), length, file_);
FSDK_BStr_Clear(utf16le_str);
}
// Write `prefix` `count` times (nothing when count <= 0), then `text_content`.
void TextDoc::Write(int count, const wchar_t* prefix, const wchar_t* text_content) {
    for (int remaining = count; remaining > 0; --remaining) {
        Write(prefix);
    }
    Write(text_content);
}
// Format a boolean as the text "True" or "False".
std::wstring LR_Format(bool val) {
    if (val) {
        return std::wstring(L"True");
    }
    return std::wstring(L"False");
}
// Format a 32-bit integer as decimal text ("%d").
std::wstring LR_Format(FS_INT32 val) {
    char digits[256] = { 0 };
    sprintf(digits, "%d", val);
    std::wstring formatted = string2wstring(digits);
    return formatted;
}
// Format a float with one digit after the decimal point ("%.1f").
std::wstring LR_Format(float val) {
    char digits[256] = { 0 };
    sprintf(digits, "%.1f", val);
    std::wstring formatted = string2wstring(digits);
    return formatted;
}
// Format a wide C string by copying it into a std::wstring.
// A null pointer yields an empty string; constructing std::wstring from a
// null pointer would be undefined behavior.
std::wstring LR_Format(const wchar_t* val) {
    if (val == NULL)
        return std::wstring();
    return std::wstring(val);
}
// Format a color value as "#" followed by six upper-case hex digits taken from
// bits 16-23, 8-15 and 0-7 of `val`, in that order. Bits above 23 are not
// part of the output.
std::wstring LR_Format(FS_ARGB val) {
    const FS_UINT8 high = (FS_UINT8)(val >> 16);
    const FS_UINT8 middle = (FS_UINT8)(val >> 8);
    const FS_UINT8 low = (FS_UINT8)(val);

    char hex[256] = { 0 };
    sprintf(hex, "#%02X%02X%02X", high, middle, low);
    return string2wstring(std::string(hex));
}
// Format an attribute enum value using the SDK's own stringification.
// Returns an empty string when the SDK leaves the output buffer null;
// assigning a null char* to std::string would be undefined behavior.
std::wstring LR_Format(FSAttributeValueEnum val) {
    FS_BSTR variant;
    FSDK_BStr_Init(variant);
    FSDK_LRStructureElement_StringifyEnumVariant(val, variant);

    std::wstring result;
    if (variant.str != NULL)
        result = string2wstring(std::string(variant.str));

    FSDK_BStr_Clear(variant);
    return result;
}
// Output the structure element.
// Writes a "< StructureElement: TYPE >" header followed by one "NAME: VALUE"
// line per supported attribute whose value type is not empty. Array-typed
// attributes are printed as "[v0, v1, ...]". Every line is prefixed with
// `depth` tabs. Nothing is written for a NULL element or when depth > 32.
//
// SDK string out-values are initialised before the call and cleared after
// use, following the Init / use / Clear sequence LR_Format uses for FS_BSTR.
void OutputLRStructureElement(FS_LRSTRUCTUREELEMENT_HANDLE element, TextDoc& doc, int depth) {
    if (depth > 32)
        return;
    if (element == NULL)
        return;

    FSElementType elem_type;
    FSDK_LRElement_GetElementType(element, elem_type);
    FS_BSTR element_type_str;
    FSDK_BStr_Init(element_type_str);
    FSDK_LRElement_StringifyElementType(elem_type, element_type_str);
    std::wstring output_str = L"< StructureElement: ";
    if (element_type_str.str != NULL)
        output_str += string2wstring(element_type_str.str);
    output_str += L" >\r\n";
    FSDK_BStr_Clear(element_type_str);
    doc.Write(depth, L"\t", output_str.c_str());

    // Start from 0 so the loop is skipped if the SDK call does not set it.
    FS_INT32 size = 0;
    FSDK_LRStructureElement_GetSupportedAttributeCount(element, size);
    for (int i = 0; i < size; i++) {
        FSAttributeType attr_type;
        FSDK_LRStructureElement_GetSupportedAttribute(element, i, attr_type);
        FSAttributeValueType attr_value_type;
        FSDK_LRStructureElement_GetAttributeValueType(element, attr_type, attr_value_type);
        // Attributes without a value produce no output line.
        if (attr_value_type == e_FSAttributeValueTypeEmpty)
            continue;

        FS_BOOL is_array = 0;
        FSDK_LRStructureElement_IsArrayAttributeValueType(attr_value_type, is_array);
        int length = 0;
        FSDK_LRStructureElement_GetAttributeValueCount(element, attr_type, length);

        std::wstring sz_val = L"";
        if (is_array)
            sz_val += L"[";
        for (int idx = 0; idx < length; idx++) {
            if (idx)
                sz_val += L", ";
            // Each case has its own scope so its local is not visible to
            // (or jumped over by) the other cases.
            switch (attr_value_type) {
                case e_FSAttributeValueTypeEnum:
                case e_FSAttributeValueTypeEnumArray: {
                    FSAttributeValueEnum attr_value_enum;
                    FSDK_LRStructureElement_GetAttributeValueEnum(element, attr_type, idx, attr_value_enum);
                    sz_val += LR_Format(attr_value_enum);
                    break;
                }
                case e_FSAttributeValueTypeInt32:
                case e_FSAttributeValueTypeInt32Array: {
                    FS_INT32 attr_value_int32;
                    FSDK_LRStructureElement_GetAttributeValueInt32(element, attr_type, idx, attr_value_int32);
                    sz_val += LR_Format(attr_value_int32);
                    break;
                }
                case e_FSAttributeValueTypeFloat:
                case e_FSAttributeValueTypeFloatArray: {
                    float attr_value_float;
                    FSDK_LRStructureElement_GetAttributeValueFloat(element, attr_type, idx, attr_value_float);
                    sz_val += LR_Format(attr_value_float);
                    break;
                }
                case e_FSAttributeValueTypeARGB:
                case e_FSAttributeValueTypeARGBArray: {
                    FS_ARGB attr_value_argb;
                    FSDK_LRStructureElement_GetAttributeValueARGB(element, attr_type, idx, attr_value_argb);
                    sz_val += LR_Format(attr_value_argb);
                    break;
                }
                case e_FSAttributeValueTypeWStr:
                case e_FSAttributeValueTypeWStrArray: {
                    FS_WSTR str;
                    FSDK_WStr_Init(str);
                    FSDK_LRStructureElement_GetAttributeValueString(element, attr_type, idx, str);
                    if (str.str != NULL)
                        sz_val += std::wstring(str.str);
                    FSDK_WStr_Clear(str);
                    break;
                }
                default:
                    break;
            }
        }
        if (is_array)
            sz_val += L"]";

        FS_BSTR key;
        FSDK_BStr_Init(key);
        FSDK_LRStructureElement_StringifyAttributeType(attr_type, key);
        output_str.clear();
        if (key.str != NULL)
            output_str = string2wstring(key.str);
        output_str += L": " + sz_val + L"\r\n";
        FSDK_BStr_Clear(key);
        doc.Write(depth, L"\t", output_str.c_str());
    }
}
// Output the graphics object element.
// Writes a "< LRGraphicsObjectElement: >" header, then the bounding box,
// matrix and page object index, and -- only when the underlying graphics
// object is a text object with non-empty text -- a "Text: ..." line. Every
// line is prefixed with `depth` tabs. Nothing is written for a NULL element.
void OutputLRGraphicsObjectElement(FS_LRGRAPHICSOBJECTELEMENT_HANDLE element, TextDoc& doc, int depth) {
    if (element == NULL)
        return;
    doc.Write(depth, L"\t", L"< LRGraphicsObjectElement: >\r\n");

    // The type is queried once, and only when a graphics object exists. It
    // keeps the non-text default otherwise, so no text is read from a NULL
    // object further down.
    FS_GRAPHICSOBJECT_HANDLE page_obj = NULL;
    FSDK_LRGraphicsObjectElement_GetGraphicsObject(element, page_obj);
    FSGraphicsObjectType type = e_FSTypeAll;
    if (page_obj != NULL)
        FSDK_GraphicsObject_GetType(page_obj, type);

    FSRectF rc_box;
    FSDK_LRGraphicsObjectElement_GetBBox(element, rc_box);
    int obj_index = 0;
    FSDK_LRGraphicsObjectElement_GetGraphicsObjectIndex(element, obj_index);
    FSMatrix matrix;
    FSDK_LRGraphicsObjectElement_GetMatrix(element, matrix);

    // snprintf bounds every write: six "%.1f" values of very large magnitude
    // would not fit into 256 bytes.
    char buffer[256] = { 0 };
    snprintf(buffer, sizeof(buffer), "BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", rc_box.left, rc_box.top, rc_box.right,
             rc_box.bottom);
    doc.Write(depth, L"\t", string2wstring(buffer).c_str());
    snprintf(buffer, sizeof(buffer), "Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", matrix.a, matrix.b, matrix.c,
             matrix.d, matrix.e, matrix.f);
    doc.Write(depth, L"\t", string2wstring(buffer).c_str());
    snprintf(buffer, sizeof(buffer), "PageObjectIndex: %d\r\n", obj_index);
    doc.Write(depth, L"\t", string2wstring(buffer).c_str());

    if (type != e_FSTypeText)
        return;

    FS_WSTR textstr;
    FSDK_WStr_Init(textstr);
    FSDK_TextObject_GetText(page_obj, textstr);
    std::wstring text;
    if (textstr.str != NULL)
        text = textstr.str;
    FSDK_WStr_Clear(textstr);

    if (!text.empty()) {
        std::wstring str = L"Text: ";
        str += text;
        str += L"\r\n";
        doc.Write(depth, L"\t", str.c_str());
    }
}
// Output the content element.
// Writes a "< LRContentElement: TYPE >" header, then the bounding box, matrix
// and the start position / length of the first graphics-object range segment,
// each prefixed with `depth` tabs. When the content element has a graphics
// object element, that element is output one level deeper. Nothing is written
// for a NULL element.
void OutputLRContentElement(FS_LRCONTENTELEMENT_HANDLE element, TextDoc& doc, int depth) {
    if (element == NULL)
        return;

    FS_LRGRAPHICSOBJECTELEMENT_HANDLE page_object = NULL;
    FSDK_LRContentElement_GetGraphicsObjectElement(element, page_object);
    const bool has_graphics_object = (page_object != NULL);

    FSElementType elem_type;
    FSDK_LRElement_GetElementType(element, elem_type);
    // Initialise before and clear after use, following the Init / use / Clear
    // sequence LR_Format uses for FS_BSTR.
    FS_BSTR element_type_str;
    FSDK_BStr_Init(element_type_str);
    FSDK_LRElement_StringifyElementType(elem_type, element_type_str);
    std::wstring output_str = L"< LRContentElement: ";
    if (element_type_str.str != NULL)
        output_str += string2wstring(element_type_str.str);
    output_str += L" >\r\n";
    FSDK_BStr_Clear(element_type_str);
    doc.Write(depth, L"\t", output_str.c_str());

    FSRectF rc_box;
    FSDK_LRContentElement_GetBBox(element, rc_box);

    // Only the first segment (index 0) of the range is reported; both values
    // stay 0 when no range handle is returned.
    // NOTE(review): range_handle is never released here -- confirm whether the
    // SDK requires the caller to release it.
    int start_pos = 0, length = 0;
    FS_RANGE_HANDLE range_handle = NULL;
    FSDK_LRContentElement_GetGraphicsObjectRange(element, range_handle);
    if (range_handle != NULL) {
        FSDK_Range_GetSegmentStart(range_handle, 0, start_pos);
        int segment_end = 0;
        FSDK_Range_GetSegmentEnd(range_handle, 0, segment_end);
        length = segment_end - start_pos + 1;
    }

    FSMatrix matrix;
    FSDK_LRContentElement_GetMatrix(element, matrix);

    // snprintf bounds every write: six "%.1f" values of very large magnitude
    // would not fit into 256 bytes.
    char temp[256] = { 0 };
    snprintf(temp, sizeof(temp), "BBox: [%.1f,%.1f,%.1f,%.1f]\r\n", rc_box.left, rc_box.top, rc_box.right,
             rc_box.bottom);
    doc.Write(depth, L"\t", string2wstring(temp).c_str());
    snprintf(temp, sizeof(temp), "Matrix: [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]\r\n", matrix.a, matrix.b, matrix.c,
             matrix.d, matrix.e, matrix.f);
    doc.Write(depth, L"\t", string2wstring(temp).c_str());
    snprintf(temp, sizeof(temp), "StartPos: %d\r\n", start_pos);
    doc.Write(depth, L"\t", string2wstring(temp).c_str());
    snprintf(temp, sizeof(temp), "Length: %d\r\n", length);
    doc.Write(depth, L"\t", string2wstring(temp).c_str());

    // If content element has the graphics object element, output the graphics object element.
    if (has_graphics_object)
        OutputLRGraphicsObjectElement(page_object, doc, depth + 1);
}
// Show the element information of the root element.
// Outputs `element` itself, then visits each child in index order: structure
// children recurse with depth + 1, while content and graphics-object children
// are output at the current depth. Each typed handle created for a child is
// released after it has been output.
void ShowLRElementInfo(FS_LRSTRUCTUREELEMENT_HANDLE element, TextDoc& doc, int depth) {
    OutputLRStructureElement(element, doc, depth);

    // Start from 0 so the loop is skipped if the SDK call does not set it.
    int elem_list_size = 0;
    FSDK_LRStructureElement_GetChildCount(element, elem_list_size);
    for (int i = 0; i < elem_list_size; i++) {
        // Get child of the structure element.
        FS_LRELEMENT_HANDLE item = NULL;
        FSDK_LRStructureElement_GetChild(element, i, item);

        FS_BOOL is_structure_element = 0;
        FSDK_LRElement_IsStructureElement(item, is_structure_element);
        FS_BOOL is_content_element = 0;
        FSDK_LRElement_IsContentElement(item, is_content_element);
        FS_BOOL is_graphicsobject_element = 0;
        FSDK_LRElement_IsGraphicsObjectElement(item, is_graphicsobject_element);

        if (is_structure_element) {
            // Continue to get child of the structure element.
            FS_LRSTRUCTUREELEMENT_HANDLE src_item = NULL;
            FSDK_LRStructureElement_Create0(item, src_item);
            ShowLRElementInfo(src_item, doc, depth + 1);
            FSDK_LRStructureElement_Release(src_item);
        } else if (is_content_element) {
            FS_LRCONTENTELEMENT_HANDLE src_item = NULL;
            FSDK_LRContentElement_Create0(item, src_item);
            OutputLRContentElement(src_item, doc, depth);
            FSDK_LRContentElement_Release(src_item);
        } else if (is_graphicsobject_element) {
            FS_LRGRAPHICSOBJECTELEMENT_HANDLE src_item = NULL;
            FSDK_LRGraphicsObjectElement_Create0(item, src_item);
            // Output the typed handle that was just created, as the other two
            // branches do, not the generic child handle.
            OutputLRGraphicsObjectElement(src_item, doc, depth);
            FSDK_LRGraphicsObjectElement_Release(src_item);
        }
    }
}
// Identify the page.
// Opens the text file at `info_path` in "w+b" mode, runs layout recognition on
// `page`, and writes the recognized element tree to the file. When no root
// element is returned, a single "no information" line is written instead.
void IdentifyPage(FS_PDFPAGE_HANDLE page, const std::wstring& info_path) {
    TextDoc text_doc(info_path, L"w+b");

    // Handles start as NULL so the root check below is meaningful even if an
    // SDK call returns without writing its out-parameter.
    // NOTE(review): context, progressive and root are never released here --
    // confirm against the SDK documentation whether they must be.
    FS_LRCONTEXT_HANDLE context = NULL;
    FSDK_LRContext_Create(page, context);

    // NOTE(review): the progressive handle is not polled; this assumes that a
    // NULL pause callback lets parsing finish inside StartParse -- confirm.
    FS_PROGRESSIVE_HANDLE progressive = NULL;
    FSDK_LRContext_StartParse(context, NULL, progressive);

    FS_LRSTRUCTUREELEMENT_HANDLE root = NULL;
    FSDK_LRContext_GetRootElement(context, root);
    if (root != NULL) {
        ShowLRElementInfo(root, text_doc, 0);
    } else {
        text_doc.Write("No layout recognition information!\r\n");
    }
}
Updated on September 2, 2022