TextLinearizationConfig

class textractor.data.text_linearization_config.TextLinearizationConfig(remove_new_lines_in_leaf_elements: bool = True, max_number_of_consecutive_new_lines: int = 2, hide_header_layout: bool = False, hide_footer_layout: bool = False, hide_figure_layout: bool = False, hide_table_layout: bool = False, hide_page_num_layout: bool = False, page_num_prefix: str = '', page_num_suffix: str = '', same_paragraph_separator: str = ' ', same_layout_element_separator: str = '\n', layout_element_separator: str = '\n\n', list_element_separator: str = '\n', list_layout_prefix: str = '', list_layout_suffix: str = '', list_element_prefix: str = '', list_element_suffix: str = '', title_prefix: str = '', title_suffix: str = '', table_layout_prefix: str = '\n\n', table_layout_suffix: str = '\n', table_remove_column_headers: bool = False, table_column_header_threshold: float = 0.9, table_linearization_format: str = 'plaintext', table_tabulate_format: str = 'github', table_min_table_words: int = 0, table_column_separator: str = '\t', table_prefix: str = '', table_suffix: str = '', table_row_separator: str = '\n', table_row_prefix: str = '', table_row_suffix: str = '', table_cell_prefix: str = '', table_cell_suffix: str = '', table_cell_header_prefix: str = '', table_cell_header_suffix: str = '', table_cell_empty_cell_placeholder: str = '', header_prefix: str = '', header_suffix: str = '', section_header_prefix: str = '', section_header_suffix: str = '', text_prefix: str = '', text_suffix: str = '', key_value_layout_prefix: str = '', key_value_layout_suffix: str = '', key_value_prefix: str = '', key_value_suffix: str = '', key_prefix: str = '', key_suffix: str = ' ', value_prefix: str = '', value_suffix: str = '', selection_element_selected: str = '[X]', selection_element_not_selected: str = '[ ]', heuristic_h_tolerance: float = 0.3, heuristic_line_break_threshold: float = 0.9, heuristic_overlap_ratio: float = 0.5, signature_token: str = '[SIGNATURE]', add_prefixes_and_suffixes_as_words: bool = False, add_prefixes_and_suffixes_in_text: bool = True)

Bases: object

The TextLinearizationConfig object defines how a document is linearized into a text string

add_prefixes_and_suffixes_as_words: bool = False

Controls if the prefixes/suffixes will be inserted in the words returned by get_text_and_words

add_prefixes_and_suffixes_in_text: bool = True

Controls if the prefixes/suffixes will be added to the linearized text

header_prefix: str = ''

Prefix for header layout elements

header_suffix: str = ''

Suffix for header layout elements

heuristic_h_tolerance: float = 0.3

How much the line below and above the current line should differ in width to be separated

heuristic_line_break_threshold: float = 0.9

How much space is acceptable between two lines before splitting them. Expressed in multiples of the minimum height

heuristic_overlap_ratio: float = 0.5

How much vertical overlap is tolerated between two subsequent lines before merging them into a single line

hide_figure_layout: bool = False

Hide figures in the linearized output

hide_footer_layout: bool = False

Hide footers in the linearized output

hide_header_layout: bool = False

Hide headers in the linearized output

hide_page_num_layout: bool = False

Hide page numbers in the linearized output

hide_table_layout: bool = False

Hide tables in the linearized output

key_prefix: str = ''

Prefix for key elements

key_suffix: str = ' '

Suffix for key elements

key_value_layout_prefix: str = ''

Prefix for key_value layout elements (not for individual key-value elements)

key_value_layout_suffix: str = ''

Suffix for key_value layout elements (not for individual key-value elements)

key_value_prefix: str = ''

Prefix for key-value elements

key_value_suffix: str = ''

Suffix for key-value elements

layout_element_separator: str = '\n\n'

Separator to use when combining linearized layout elements

list_element_prefix: str = ''

Prefix for elements in a list layout (children)

list_element_separator: str = '\n'

Separator for elements in a list layout

list_element_suffix: str = ''

Suffix for elements in a list layout (children)

list_layout_prefix: str = ''

Prefix for list layout elements (parent)

list_layout_suffix: str = ''

Suffix for list layout elements (parent)

max_number_of_consecutive_new_lines: int = 2

Maximum number of consecutive new lines kept in the linearized text; removes extra whitespace

page_num_prefix: str = ''

Prefix for page number layout elements

page_num_suffix: str = ''

Suffix for page number layout elements

remove_new_lines_in_leaf_elements: bool = True

Removes new lines in leaf layout elements; this removes extra whitespace

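same_layout_element_separator: str = '\n'

Separator to use when combining elements within the same layout element

same_paragraph_separator: str = ' '

Separator to use when combining elements within the same paragraph
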
section_header_prefix: str = ''

Prefix for section header layout elements

section_header_suffix: str = ''

Suffix for section header layout elements

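selection_element_not_selected: str = '[ ]'

Representation of a selection element that is not selected

selection_element_selected: str = '[X]'

Representation of a selection element that is selected
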
signature_token: str = '[SIGNATURE]'

Signature representation in the linearized text

table_cell_empty_cell_placeholder: str = ''

Placeholder for empty cells

table_cell_header_prefix: str = ''

Prefix for header cell

table_cell_header_suffix: str = ''

Suffix for header cell

table_cell_prefix: str = ''

Prefix for table cell

table_cell_suffix: str = ''

Suffix for table cell

table_column_header_threshold: float = 0.9

Threshold for a row to be selected as header when rendering as markdown. 0.9 means that 90% of the cells must have the is_header_cell flag.

table_column_separator: str = '\t'

Table column separator used when linearizing layout tables; not used if AnalyzeDocument was called with the TABLES feature

table_layout_prefix: str = '\n\n'

Prefix for table elements

table_layout_suffix: str = '\n'

Suffix for table elements

table_linearization_format: str = 'plaintext'

How to represent tables in the linearized output. Choices are plaintext or markdown.

table_min_table_words: int = 0

Threshold below which tables will be rendered as words instead of using table layout

table_prefix: str = ''

table_remove_column_headers: bool = False

Remove column headers from tables

table_row_prefix: str = ''

Prefix for table row

table_row_separator: str = '\n'

Table row separator

table_row_suffix: str = ''

Suffix for table row

table_suffix: str = ''

table_tabulate_format: str = 'github'

Markdown tabulate format to use when tables are linearized as markdown

text_prefix: str = ''

Prefix for text layout elements

text_suffix: str = ''

Suffix for text layout elements

title_prefix: str = ''

Prefix for title layout elements

title_suffix: str = ''

Suffix for title layout elements

value_prefix: str = ''

Prefix for value elements

value_suffix: str = ''

Suffix for value elements