Loading...
Searching...
No Matches
TextExtractor.h
Go to the documentation of this file.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5#ifndef PDFTRON_H_CPPPDFTextExtractor
6#define PDFTRON_H_CPPPDFTextExtractor
7
8#include <PDF/Page.h>
9#include <PDF/Rect.h>
10#include <Common/UString.h>
11#include <C/PDF/TRN_TextExtractor.h>
12#include <vector>
13#include <PDF/Highlights.h>
14#include<PDF/OCG/Context.h>
15
16namespace pdftron {
17 namespace PDF {
18
19class Style;
20class Word;
21class Line;
22
27{
28 int index; // character index
29 int length; // character length
30};
31
117{
118public:
122
128
134 {
135 // Disables expanding of ligatures using a predefined mapping.
136 // Default ligatures are: fi, ff, fl, ffi, ffl, ch, cl, ct, ll,
137 // ss, fs, st, oe, OE.
139
140 // Disables removing duplicated text that is frequently used to
141 // achieve visual effects of drop shadow and fake bold.
143
144 // Treat punctuation (e.g. full stop, comma, semicolon, etc.) as
145 // word break characters.
147
148 // Enables removal of text that is obscured by images or
149 // rectangles. Because this option carries a small performance
150 // penalty for text extraction, it is not enabled by
151 // default.
153
154 // Enables removal of text that uses rendering mode 3 (i.e. invisible text).
155 // Invisible text is usually used in 'PDF Searchable Images' (i.e. scanned
156 // pages with corresponding OCR text). For this reason, invisible text
157 // is extracted by default; set this flag to remove it.
159
160 // Enables removal of text that is marked as part of a Watermark layer.
162
163 // Use Z-order as reading order for text
165 };
166
176 void Begin(Page page, const Rect* clip_ptr = 0, UInt32 flags = 0);
177
189
194
200 void SetRightToLeftLanguage(bool rtl);
204
219 UString GetAsText(bool dehyphen = true);
220
221#ifndef SWIG
222 void GetAsText(UString& out_str, bool dehyphen = true);
223#endif
224
231
232#ifndef SWIG
233 void GetTextUnderAnnot(UString& out_str, const Annot& annot);
234#endif
235
236
241 {
242 // Output words as XML elements instead of inline text.
244
245 // Include bounding box information for each XML element.
246 // The bounding box information will be stored as 'bbox' attribute.
248
249 // Include font and styling information.
251 };
252
295 UString GetAsXML(UInt32 xml_output_flags = 0);
296
297#ifndef SWIG
298 void GetAsXML(UString& out_xml, UInt32 xml_output_flags = 0);
299#endif
300
306 Highlights GetHighlights(const std::vector<CharRange>& char_ranges);
307
308#ifndef SWIG
315 Highlights GetHighlights(const CharRange* char_ranges, size_t char_ranges_count);
316#endif
317
321 int GetNumLines();
322
323
331
335 void Destroy();
336
337 // @cond PRIVATE_DOC
338private:
339 TRN_TextExtractor mp_extractor;
340
341 // TextExtractor should not be copied
342 TextExtractor(const TextExtractor& other);
343 TextExtractor& operator= (const TextExtractor&);
344 // @endcond
345};
346
// Style: class declaration that holds a TRN_TextExtractorStyle handle
// (see mp_style below) and exposes font/color queries on it.
// NOTE(review): this listing's embedded line numbers skip (355 -> 363 -> 368,
// 378 -> 388, 413 -> 415), so some declarations and every Doxygen comment
// block are missing from this view -- consult the original header for the
// complete interface before relying on this text.
352class Style
353{
354public:
355
363
368
// Returns the font size as a double.
// NOTE(review): the unit of the value is not visible here -- confirm.
377 double GetFontSize();
378
388
// Boolean query; presumably true when the style's font is italic -- TODO confirm.
393 bool IsItalic();
394
// Boolean query; presumably true when the style's font is a serif face -- TODO confirm.
400 bool IsSerif();
401
// Returns the color as a vector of ints.
// NOTE(review): presumably three RGB components, mirroring the rgb[3]
// overload below -- confirm count and value range.
405 std::vector<int> GetColor();
406
// Overload compiled only when SWIG is not defined: takes a caller-supplied
// UInt8 array declared with three elements and named rgb.
407#ifndef SWIG
408 void GetColor(UInt8 rgb[3]);
409#endif
410
// Comparison operators against another Style. The comparison criteria are
// not visible in this header; implementations are presumably in
// Impl/TextExtractor.inl -- verify.
411 bool operator== (const Style& s) const;
412 bool operator!= (const Style& s) const;
413
415
// Everything between @cond/@endcond is excluded from the generated docs
// unless PRIVATE_DOC is enabled, and is compiled out when SWIGHIDDEN is defined.
416 // @cond PRIVATE_DOC
417 #ifndef SWIGHIDDEN
// Copy constructor.
418 Style(const Style& s);
// Constructs a Style from a TRN_TextExtractorStyle value. Not marked
// explicit, so it also acts as an implicit conversion.
419 Style(TRN_TextExtractorStyle impl);
// The underlying TRN_TextExtractorStyle handle.
420 TRN_TextExtractorStyle mp_style;
421 #endif
422 // @endcond
423};
424
// Word: class declaration that holds a TRN_TextExtractorWord handle
// (see mp_word below) and exposes geometry and style queries on it.
// NOTE(review): this listing's embedded line numbers skip (432 -> 437 -> 445,
// 476 -> 481 -> 486, 495 -> 500 -> 507), so some declarations and every
// Doxygen comment block are missing from this view -- consult the original
// header for the complete interface.
430class Word
431{
432public:
437
445
// Overload compiled only when SWIG is not defined: takes a caller-supplied
// array of doubles declared with four elements.
// NOTE(review): presumably filled with the bounding box -- confirm coordinate order.
446#ifndef SWIG
447 void GetBBox(double out_bbox[4]);
448#endif
449
// Returns a vector of doubles.
// NOTE(review): presumably eight values (four x/y corner points), matching
// the out_quad[8] overload below -- TODO confirm.
454 std::vector<double> GetQuad();
455
// Non-SWIG overload taking a caller-supplied array declared with eight doubles.
456#ifndef SWIG
457 void GetQuad(double out_quad[8]);
458#endif
459
// Same shape as GetQuad(), but takes a glyph index.
// NOTE(review): valid index range and base (0 or 1) are not visible here -- confirm.
465 std::vector<double> GetGlyphQuad(int glyph_idx);
466
// Non-SWIG overload of GetGlyphQuad taking a caller-supplied eight-double array.
467#ifndef SWIG
468 void GetGlyphQuad(int glyph_idx, double out_quad[8]);
469#endif
470
// Returns a Style by value for the given character index.
// NOTE(review): valid index range is not visible here -- confirm.
475 Style GetCharStyle(int char_idx);
476
481
486
// NOTE(review): the declarations inside both branches of this SWIG
// conditional (original lines 491 and 493) are missing from this listing.
490#ifdef SWIG
492#else
494#endif
495
500
507
// Boolean validity query; what makes a Word invalid is not visible here.
511 bool IsValid();
512
// Comparison operators against another Word; criteria are not visible in
// this header (implementations presumably in Impl/TextExtractor.inl -- verify).
513 bool operator== (const Word&) const;
514 bool operator!= (const Word&) const;
516
// Everything between @cond/@endcond is excluded from the generated docs
// unless PRIVATE_DOC is enabled, and is compiled out when SWIGHIDDEN is defined.
517 // @cond PRIVATE_DOC
518 #ifndef SWIGHIDDEN
// Constructs a Word from a TRN_TextExtractorWord value. Not marked explicit,
// so it also acts as an implicit conversion.
519 Word(TRN_TextExtractorWord impl);
// The underlying TRN_TextExtractorWord handle.
520 TRN_TextExtractorWord mp_word;
521 #endif
522 // @endcond
523};
524
// Line: class declaration that holds a TRN_TextExtractorLine handle
// (see mp_line below) and exposes geometry and word access on it.
// NOTE(review): this listing's embedded line numbers skip (532 -> 537 -> 543,
// 569 -> 575, 581 -> 586 ... 616), so some declarations and every Doxygen
// comment block are missing from this view -- consult the original header
// for the complete interface.
530class Line {
531public:
532
537
543
// GetBBox has a different return type depending on the build: a Rect by
// value when SWIG is defined, otherwise a pointer to const double.
// NOTE(review): the number of doubles behind the pointer and the lifetime of
// that storage are not visible here -- confirm before dereferencing.
550#ifdef SWIG
551 Rect GetBBox();
552#else
553 const double* GetBBox();
554#endif
555
// Returns a vector of doubles.
// NOTE(review): presumably eight values (four x/y corner points), matching
// the out_quad[8] overload below -- TODO confirm.
560 std::vector<double> GetQuad();
561
// Overload compiled only when SWIG is not defined: takes a caller-supplied
// array declared with eight doubles.
562#ifndef SWIG
567 void GetQuad(double out_quad[8]);
568#endif
569
575
// Returns a Word by value for the given word index.
// NOTE(review): valid index range and out-of-range behavior are not visible
// here -- confirm.
580 Word GetWord(int word_idx);
581
586
591
596
603
610
616
// Boolean validity query; what makes a Line invalid is not visible here.
620 bool IsValid();
621
// Comparison operators against another Line; criteria are not visible in
// this header (implementations presumably in Impl/TextExtractor.inl -- verify).
622 bool operator== (const Line&) const;
623 bool operator!= (const Line&) const;
625
// Everything between @cond/@endcond is excluded from the generated docs
// unless PRIVATE_DOC is enabled, and is compiled out when SWIGHIDDEN is defined.
626 // @cond PRIVATE_DOC
627 #ifndef SWIGHIDDEN
// Constructs a Line from a TRN_TextExtractorLine value. Not marked explicit,
// so it also acts as an implicit conversion.
628 Line(TRN_TextExtractorLine impl);
// The underlying TRN_TextExtractorLine handle.
629 TRN_TextExtractorLine mp_line;
630 #endif
631 // @endcond
632};
633
634
635
636
637#include <Impl/TextExtractor.inl>
638
639 }; // namespace PDF
640}; // namespace pdftron
641
642#endif // PDFTRON_H_CPPPDFTextExtractor
const double * GetBBox()
std::vector< double > GetQuad()
bool operator==(const Line &) const
void GetQuad(double out_quad[8])
bool operator!=(const Line &) const
Word GetWord(int word_idx)
void GetColor(UInt8 rgb[3])
bool operator!=(const Style &s) const
std::vector< int > GetColor()
bool operator==(const Style &s) const
UString GetAsXML(UInt32 xml_output_flags=0)
Highlights GetHighlights(const std::vector< CharRange > &char_ranges)
pdftron::PDF::Style Style
void SetOCGContext(OCG::Context *ctx)
UString GetTextUnderAnnot(const Annot &annot)
void GetAsText(UString &out_str, bool dehyphen=true)
void Begin(Page page, const Rect *clip_ptr=0, UInt32 flags=0)
void GetTextUnderAnnot(UString &out_str, const Annot &annot)
void SetRightToLeftLanguage(bool rtl)
UString GetAsText(bool dehyphen=true)
Style GetCharStyle(int char_idx)
std::vector< double > GetGlyphQuad(int glyph_idx)
std::vector< double > GetQuad()
const Unicode * GetString()
void GetGlyphQuad(int glyph_idx, double out_quad[8])
void GetQuad(double out_quad[8])
void GetBBox(double out_bbox[4])
bool operator!=(const Word &) const
bool operator==(const Word &) const
TRN_UInt32 UInt32
Definition BasicTypes.h:13
TRN_UInt8 UInt8
Definition BasicTypes.h:15
TRN_Unicode Unicode
Definition BasicTypes.h:22