Skip to content

table

autocorpus.table ¤

Tables-JSON top-level builder script.

Functions¤

__check_superrow(cells) ¤

Check if the current row is a superrow.

Superrows contain cells that are split and contain more values than other cells on the same row.

Parameters:

Name Type Description Default
cells list[str]

Cells in row

required
Source code in autocorpus/table.py
101
102
103
104
105
106
107
108
109
110
111
112
113
def __check_superrow(cells: list[str]) -> bool:
    """Tell whether a table row is a superrow.

    Superrows contain cells that are split and contain more values than other cells on
    the same row. In practice the row qualifies when, once blank cells (empty string,
    a lone newline or the text "None") are discarded, all remaining cells share one
    single value and that value starts with an ASCII letter.

    Args:
        cells: Cells in row

    Returns:
        True if the row is a superrow, False otherwise
    """
    distinct_values: set[str] = {
        value for value in cells if value not in ("", "\n", "None")
    }
    if len(distinct_values) != 1:
        return False
    (only_value,) = distinct_values
    return re.match("[a-zA-Z]", only_value) is not None

__get_headers(t, config) ¤

Identify headers from a table.

Parameters:

Name Type Description Default
t BeautifulSoup

BeautifulSoup object of table

required
config dict[str, Any]

Configuration dictionary

required

Returns:

Type Description
list[int]

List of header indexes

Raises:

Type Description
KeyError

Missing element

Source code in autocorpus/table.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def __get_headers(t: BeautifulSoup, config: dict[str, Any]) -> list[int]:
    """Locate the header rows of a table.

    A row is treated as a header when the configured header element is found
    inside it, or when the row itself carries the CSS class ``thead``. When no row
    qualifies, the first row (index 0) is reported as the header.

    Args:
        t: BeautifulSoup object of table
        config: Configuration dictionary

    Returns:
        List of header indexes

    Raises:
        KeyError: Missing element
    """
    data_config = config["tables"]["data"]
    header_rows: list[int] = []
    for position, row in enumerate(get_data_element_node(data_config["table-row"], t)):
        contains_header_cell = get_data_element_node(
            data_config["header-element"], row
        )
        if contains_header_cell or (
            "class" in row.attrs and "thead" in row.attrs["class"]
        ):
            header_rows.append(position)
    # fall back to the first row when no table headers were found
    return header_rows or [0]

__is_mix(s) ¤

Check if input string is a mix of number and text.

Parameters:

Name Type Description Default
s str

input string

required

Returns:

Type Description
bool

True if the string contains both digit and non-digit characters, otherwise False

Source code in autocorpus/table.py
161
162
163
164
165
166
167
168
169
170
171
172
173
def __is_mix(s: str) -> bool:
    """Report whether a string mixes digits with other characters.

    Any non-digit character counts as "other", including punctuation and
    whitespace, so a value such as ``1.5`` is also reported as a mix.

    Args:
        s (str): input string

    Returns:
        (bool): True if ``s`` holds at least one digit and at least one non-digit
            character, False otherwise (including for the empty string)

    """
    digit_flags = [character.isdigit() for character in s]
    return any(digit_flags) and not all(digit_flags)

__is_number(s) ¤

Check if input string is a number.

Parameters:

Name Type Description Default
s str

input string

required

Returns:

Type Description
bool

True if the string can be parsed as a float once commas are removed, otherwise False

Source code in autocorpus/table.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
def __is_number(s: str) -> bool:
    """Report whether a string can be read as a number.

    Commas are removed first (so thousands separators are tolerated) and the
    remainder is handed to ``float``; anything ``float`` accepts counts as a
    number.

    Args:
        s (str): input string

    Returns:
        (bool): True if the comma-stripped string parses as a float, False
            otherwise

    """
    without_commas = s.replace(",", "")
    try:
        float(without_commas)
    except ValueError:
        return False
    return True

__is_text(s) ¤

Check if input string is all text.

Parameters:

Name Type Description Default
s str

input string

required

Returns:

Type Description
bool

True if the string contains no digit characters, otherwise False

Source code in autocorpus/table.py
176
177
178
179
180
181
182
183
184
185
186
def __is_text(s: str) -> bool:
    """Report whether a string is free of digits.

    Args:
        s: input string

    Returns:
        True if no character of ``s`` is a digit (the empty string qualifies),
        False otherwise

    """
    for character in s:
        if character.isdigit():
            return False
    return True

__table2json(table_2d, header_idx, subheader_idx, superrow_idx, table_num, title, footer, caption) ¤

Transform tables from nested lists to JSON.

Parameters:

Name Type Description Default
table_2d list

nested list tables

required
header_idx list

list of header indices

required
subheader_idx list

list of subheader indices

required
superrow_idx list

list of superrow indices

required
table_num int

table number

required
title str

table title

required
footer str

table footer

required
caption str

table caption

required

Returns:

Name Type Description
tables list

tables in JSON format

Source code in autocorpus/table.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
def __table2json(
    table_2d,
    header_idx,
    subheader_idx,
    superrow_idx,
    table_num,
    title,
    footer,
    caption,
):
    """Transform tables from nested lists to JSON.

    Rows are read from top to bottom. A header row selects the group of header
    rows (taken from ``subheader_idx``) that becomes the columns of the data rows
    that follow, a superrow sets the current section name, and every other row
    is a data row. A new table is started whenever a data row sits under a
    different header than the previous data row, and a new section is started
    whenever the section name differs from that of the previous data row.
    Rows whose cells are all ``""`` or ``"None"`` are skipped.

    Args:
        table_2d (list): nested list tables
        header_idx (list): list of header indices
        subheader_idx (list): list of subheader indices, as groups (lists) of
            row indices that together form one header
        superrow_idx (list): list of superrow indices
        table_num (int): table number
        title (str): table title
        footer (str): table footer
        caption (str): table caption

    Returns:
        tables (list): tables in JSON format; each entry is a dict with the keys
            "identifier", "title", "caption", "columns", "section" and "footer".
            The identifier is ``table_num + 1`` and gains a ".1", ".2", ...
            suffix when the input is split into more than one table.

    Raises:
        StopIteration: a row listed in ``header_idx`` belongs to no group of
            ``subheader_idx``

    """
    blank_cells = ("", "None")

    tables = []
    sections = []
    cur_section = {}

    pre_header = []
    pre_superrow = None
    cur_header = ""
    cur_superrow = ""
    for row_idx, row in enumerate(table_2d):
        # Compare against the blank markers instead of relying on truthiness:
        # cells may already be floats, and a row holding only 0.0 is real data.
        if all(cell in blank_cells for cell in row):
            continue
        if row_idx in header_idx:
            header_group = next(
                group for group in subheader_idx if row_idx in group
            )
            cur_header = [table_2d[i] for i in header_group]
        elif row_idx in superrow_idx:
            cur_superrow = next(cell for cell in row if cell not in blank_cells)
        else:
            if cur_header != pre_header:
                # header changed: open a new table with a fresh section list
                sections = []
                pre_superrow = None
                tables.append(
                    {
                        "identifier": str(table_num + 1),
                        "title": title,
                        "caption": caption,
                        "columns": cur_header,
                        "section": sections,
                        "footer": footer,
                    }
                )
            if cur_superrow != pre_superrow:
                cur_section = {"section_name": cur_superrow, "results": [row]}
                sections.append(cur_section)
            else:
                cur_section["results"].append(row)

            pre_header = cur_header
            pre_superrow = cur_superrow

    if len(tables) > 1:
        for table_idx, table in enumerate(tables):
            table["identifier"] += f".{table_idx + 1}"
    return tables

__table_to_2d(t) ¤

Transform an HTML table into a two-dimensional nested list, expanding row and column spans.

Parameters:

Name Type Description Default
t BeautifulSoup

HTML table

required

Returns:

Type Description
list[list[str]]

Table structure as a nested list

Source code in autocorpus/table.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def __table_to_2d(t: BeautifulSoup) -> list[list[str]]:
    """Transform an HTML table into a rectangular nested list of cell strings.

    A cell spanning several rows or columns is copied into every position it
    covers. The width of the grid is taken from the first row; spans reaching
    outside the grid are ignored. Each cell value is whitespace-normalised,
    stripped of leftover ``<span>``/``<hr/>`` markup and of one pair of
    enclosing parentheses, and values matching ``PVAL_REGEX`` or
    ``PVAL_SCIENTIFIC_REGEX`` are rewritten towards plain e-notation.

    Note that explicit ``colspan``/``rowspan`` attributes are added to every
    cell of ``t`` that lacks them.

    Args:
        t: HTML table

    Returns:
        Table structure as a nested list, or an empty list when the table has
        no rows

    """
    # https://stackoverflow.com/questions/48393253/how-to-parse-table-with-rowspan-and-colspan
    rows: list[Tag] = t.find_all("tr")
    if not rows:
        return []

    # fill colspan and rowspan so that every cell carries both attributes
    for row in rows:
        for col in row.find_all(["th", "td"]):
            col.attrs.setdefault("colspan", 1)
            col.attrs.setdefault("rowspan", 1)

    # first scan, see how many columns we need (measured on the first row only)
    n_cols: int = sum(
        int(cell.attrs["colspan"]) for cell in rows[0].find_all(["th", "td"])
    )

    # build an empty matrix for all possible cells
    table: list[list[str]] = [[""] * n_cols for _ in rows]

    # fill matrix from row data
    # track pending rowspans, column number mapping to count
    rowspans: dict[int, int] = {}
    for row_idx, row in enumerate(rows):
        span_offset: int = 0  # how many columns are skipped due to row and colspans
        for col_idx, cell in enumerate(row.find_all(["td", "th"])):
            # adjust for preceding row and colspans
            col_idx += span_offset
            while rowspans.get(col_idx, 0):
                span_offset += 1
                col_idx += 1

            # fill table data
            rowspan = int(cell.attrs["rowspan"])
            rowspans[col_idx] = rowspan
            colspan: int = int(cell.attrs["colspan"])

            # next column is offset by the colspan
            span_offset += colspan - 1
            value: str = "".join(navigate_contents(item) for item in cell.contents)

            # clean the cell: thin and non-breaking spaces, then any remaining
            # whitespace (newlines included), all become plain spaces
            value = value.strip().replace("\u2009", " ").replace("\u00a0", " ")
            value = re.sub(r"\s", " ", value)
            value = re.sub("<\\/?span[^>\n]*>?|<hr\\/>?", "", value)
            if value.startswith("(") and value.endswith(")"):
                value = value[1:-1]
            if re.match(PVAL_REGEX, value):
                value = re.sub(
                    r"(\s{0,1})[*××xX](\s{0,1})10(_{0,1})", "e", value
                ).replace("−", "-")
            if re.match(PVAL_SCIENTIFIC_REGEX, value):
                value = re.sub(r"(\s{0,1})[–−-](\s{0,1})", "-", value)
                value = re.sub(r"(\s{0,1})[eE]", "e", value)
            for drow, dcol in product(range(rowspan), range(colspan)):
                try:
                    table[row_idx + drow][col_idx + dcol] = value
                    rowspans[col_idx + dcol] = rowspan
                except IndexError:
                    # rowspan or colspan outside the confines of the table
                    pass

        # update rowspan bookkeeping
        rowspans = {c: s - 1 for c, s in rowspans.items() if s > 1}
    return table

get_table_json(soup, config, file_path) ¤

Extracts and processes tables from an HTML document.

This is done using BeautifulSoup and a configuration dictionary.

The function performs the following steps:

1. Extracts tables from the HTML document based on the provided configuration.
2. Removes empty tables and tables with specific classes (e.g., "table-group").
3. Identifies and processes table headers, superrows, and subheaders.
4. Converts tables into a 2D format and processes cell data types (e.g., numeric, text, mixed).
5. Converts the processed table data into a JSON-compatible format.
6. Merges headers and formats the final table data for output.

Parameters:

Name Type Description Default
soup BeautifulSoup

A BeautifulSoup object representing the parsed HTML document.

required
config dict[str, Any]

A dictionary containing configuration options for table processing.

required
file_path Path

The file name or path of the HTML document being processed.

required

Returns:

Type Description
tuple[dict[str, Any], list[dict[str, Any]]]

A dictionary containing the processed table data in JSON format and a list of dictionaries representing empty tables.

Source code in autocorpus/table.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
def get_table_json(
    soup: BeautifulSoup, config: dict[str, Any], file_path: Path
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Extracts and processes tables from an HTML document.

    This is done using BeautifulSoup and a configuration dictionary.

    The function performs the following steps:
    1. Extracts tables from the HTML document based on the provided configuration.
    2. Removes empty tables and tables with specific classes (e.g., "table-group").
    3. Identifies and processes table headers, superrows, and subheaders.
    4. Converts tables into a 2D format and processes cell data types (e.g., numeric,
        text, mixed).
    5. Converts the processed table data into a JSON-compatible format.
    6. Merges headers and formats the final table data for output.

    Tables that yield no cells once converted to 2D are skipped.

    Args:
        soup: A BeautifulSoup object representing the parsed HTML document.
        config: A dictionary containing configuration options for table processing.
        file_path: The file name or path of the HTML document being processed.

    Returns:
        A dictionary containing the processed table data in JSON format and a list of
            dictionaries representing empty tables (those without a ``tbody``).
    """
    soup_tables: list[dict[str, Any]] = handle_tables(config["tables"], soup)

    file_name = file_path.name
    table_identifier: str | None = None
    if re.search(r"_table_\d+\.html", file_name):
        table_identifier = file_name.split("/")[-1].split("_")[-1].split(".")[0]

    # remove empty table and other table classes; only tables lacking a <tbody>
    # are reported back as empty
    kept_tables: list[dict[str, Any]] = []
    empty_tables: list[dict[str, Any]] = []
    for soup_table in soup_tables:
        node = soup_table["node"]
        has_body = bool(node.find_all("tbody"))
        if not has_body:
            empty_tables.append(soup_table)
        if has_body and "table-group" not in node.attrs.get("class", []):
            kept_tables.append(soup_table)

    # cell values (lower-cased) that carry no type information
    blank_cells = ("none", "", "-")

    # One table
    tables = []
    for table_num, soup_table in enumerate(kept_tables):
        node = soup_table["node"]

        # remove empty table header
        if header := node.find("td", "thead-hr"):
            header.parent.extract()

        header_idx: list[int] = __get_headers(node, config)

        # span table to single-cells
        table_2d: list[list[Any]] = __table_to_2d(node)
        # an empty result (no rows, or rows without any cell) cannot be processed
        if not table_2d or not table_2d[0]:
            continue

        # find superrows
        superrow_idx = [
            row_idx
            for row_idx, row in enumerate(table_2d)
            if row_idx not in header_idx and __check_superrow(row)
        ]

        # identify section names in index column
        if not superrow_idx:
            first_col = [row[0] for row in table_2d]
            # row index at which each value of the first column first appears
            first_seen: dict[Any, int] = {}
            for row_idx, val in enumerate(first_col):
                first_seen.setdefault(val, row_idx)
            # values whose first appearance is in a header row are left out
            first_col_vals = [
                val for val in first_col if first_seen[val] not in header_idx
            ]
            unique_vals = dict.fromkeys(
                val for val in first_col_vals if val not in ("", "None")
            )
            if len(unique_vals) <= len(first_col_vals) / 2:
                section_names = list(unique_vals)
                # ascending, since names are listed in order of first appearance
                insert_at = [first_seen[name] for name in section_names]
                n_cols = len(table_2d[0])
                # Insert from the bottom up so that positions computed on the
                # original table stay valid while rows are being added.
                for idx, name in zip(reversed(insert_at), reversed(section_names)):
                    table_2d.insert(idx, [name] * n_cols)
                # each superrow is pushed down by the superrows inserted above it
                superrow_idx = [idx + shift for shift, idx in enumerate(insert_at)]
                # header rows are pushed down in the same way
                header_idx = [
                    h + sum(idx <= h for idx in insert_at) for h in header_idx
                ]
                # the first column now lives in the superrows
                for row in table_2d:
                    row.pop(0)

        # Identify subheaders
        non_value_idx = set(header_idx) | set(superrow_idx)
        value_idx = [i for i in range(len(table_2d)) if i not in non_value_idx]

        # dominant cell type of each column (ties favour "num", then "txt")
        col_type = []
        for column in zip(*table_2d):
            num_cnt = 0
            txt_cnt = 0
            mix_cnt = 0
            for cell in column:
                cell = str(cell).lower()
                if cell in blank_cells:
                    continue
                if __is_number(cell):
                    num_cnt += 1
                elif __is_mix(cell):
                    mix_cnt += 1
                elif __is_text(cell):
                    txt_cnt += 1
            top_cnt = max(num_cnt, txt_cnt, mix_cnt)
            if top_cnt == num_cnt:
                col_type.append("num")
            elif top_cnt == txt_cnt:
                col_type.append("txt")
            else:
                col_type.append("mix")

        # a value row is a subheader when at least half of its cells hold text
        # in a column that is not predominantly text
        subheader_idx: list[int] = []
        for row_idx in value_idx:
            cur_row = table_2d[row_idx]
            unmatch_cnt = 0
            for kind, cell in zip(col_type, cur_row):
                cell = str(cell).lower()
                if kind != "txt" and cell not in blank_cells and __is_text(cell):
                    unmatch_cnt += 1
            if unmatch_cnt >= len(cur_row) / 2:
                subheader_idx.append(row_idx)
        # keep the indexes ordered: the grouping below relies on adjacent entries
        header_idx = sorted(header_idx + subheader_idx)

        # group runs of consecutive header rows into multi-row headers
        new_subheader_idx: list[list[int]] = []
        tmp: list[int] = [header_idx[0]]
        for i, j in pairwise(header_idx):
            if j == i + 1:
                tmp.append(j)
            else:
                new_subheader_idx.append(tmp)
                tmp = [j]
        new_subheader_idx.append(tmp)

        # convert to float
        for row in table_2d:
            for col_idx, cell in enumerate(row):
                try:
                    row[col_idx] = float(
                        cell.replace("−", "-").replace("–", "-").replace(",", "")
                    )
                except (AttributeError, TypeError, ValueError):
                    # not a number: keep the cell as it is
                    continue

        cur_tables = __table2json(
            table_2d,
            header_idx,
            new_subheader_idx,
            superrow_idx,
            table_num,
            soup_table["title"],
            soup_table["footer"],
            soup_table["caption"],
        )
        # merge headers: stack the rows of a multi-row header column by column
        sep = "|"
        for json_table in cur_tables:
            headers = json_table["columns"]
            if not headers:
                continue
            json_table["columns"] = [
                sep.join(str(header_row[col_idx]) for header_row in headers).rstrip(
                    sep
                )
                for col_idx in range(len(headers[0]))
            ]

        tables += cur_tables

    table_json = {"tables": tables}
    table_json = __format_table_bioc(table_json, table_identifier, str(file_path))
    return table_json, empty_tables