Skip to content

run

autocorpus.run ¤

Module to run the autocorpus pipeline.

Functions¤

run_autocorpus(config, structure, key, output_format) ¤

Run the autocorpus pipeline on a given file.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config` | | The configuration file to use. | required |
| `structure` | | The structure of the input files. | required |
| `key` | | The key in the structure dict for the current file. | required |
| `output_format` | | The output format to use (JSON or XML). | required |
Source code in autocorpus/run.py (lines 8–61):
def run_autocorpus(config, structure, key, output_format):
    """Run the autocorpus pipeline on a given file.

    The entry ``structure[key]`` is processed with ``process_file`` and the
    results are written into the entry's ``out_dir``. Output files are named
    after the final path component of ``key`` (``<name>`` below):

    * ``<name>_bioc.json`` or ``<name>_bioc.xml``: the main text, written
      only when the entry's ``main_text`` is non-empty.
    * ``<name>_abbreviations.json``: the abbreviations, written only when
      the entry's ``main_text`` is non-empty.
    * ``<name>_tables.json``: the tables, written only when the processed
      result reports ``has_tables``.

    Args:
        config: The configuration file to use.
        structure: The structure of the input files. The entry for ``key``
            is read for its ``main_text``, ``linked_tables`` and ``out_dir``
            items.
        key: The key in the structure dict for the current file.
        output_format: The output format to use (JSON or XML). Compared
            case-insensitively; any value other than ``"json"`` selects XML.
    """
    entry = structure[key]
    ac = process_file(
        config=config,
        file_path=Path(entry["main_text"]),
        linked_tables=sorted(Path(lt) for lt in entry["linked_tables"]),
    )

    out_dir = Path(entry["out_dir"])
    # Normalise Windows-style separators up front so that every output file,
    # including the tables file written when there is no main text, is named
    # after the same final path component on every platform.
    stem = Path(key.replace("\\", "/")).name

    if entry["main_text"]:
        if output_format.lower() == "json":
            (out_dir / f"{stem}_bioc.json").write_text(
                ac.main_text_to_bioc_json(), encoding="utf-8"
            )
        else:
            (out_dir / f"{stem}_bioc.xml").write_text(
                ac.main_text_to_bioc_xml(), encoding="utf-8"
            )
        (out_dir / f"{stem}_abbreviations.json").write_text(
            ac.abbreviations_to_bioc_json(), encoding="utf-8"
        )

        # TODO: Once SI conversion is supported, also write the PDF outputs
        # ("<file>.pdf_bioc.json" and "<file>.pdf_tables.json").

    # AC does not support the conversion of tables or abbreviations to XML
    if ac.has_tables:
        (out_dir / f"{stem}_tables.json").write_text(
            ac.tables_to_bioc_json(), encoding="utf-8"
        )