Skip to content

bioc_formatter

autocorpus.bioc_formatter ¤

Top-level BioC collection builder script.

Classes¤

Functions¤

get_formatted_bioc_collection(main_text, file_path) ¤

Constructs a BioC collection from input document-level data.

Parameters:

Name Type Description Default
main_text dict[str, Any]

Input document-level data.

required
file_path Path

Path to the input file.

required

Returns:

Type Description
BioCCollection

BioC collection

Source code in autocorpus/bioc_formatter.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def get_formatted_bioc_collection(
    main_text: dict[str, Any],
    file_path: Path,
) -> BioCCollection:  # TODO: Change return type to ac_bioc.BioCCollection
    """Wrap a single input document in a newly built BioC collection.

    Args:
        main_text: Input document-level data.
        file_path: Path to the input file.

    Returns:
        BioC collection holding the one document built from the inputs.
    """
    # Collection date is today's date in compact YYYYMMDD form.
    date_stamp = datetime.today().strftime("%Y%m%d")
    document = get_formatted_bioc_document(main_text, file_path)
    return BioCCollection(
        date=date_stamp,
        documents=[document],
        source="Auto-CORPus (full-text)",
        key="autocorpus_fulltext.key",
    )

get_formatted_bioc_document(main_text, file_path) ¤

Constructs the BioC document template using the provided data store.

Parameters:

Name Type Description Default
main_text dict[str, Any]

Input document-level data.

required
file_path Path

Path to the input file.

required

Returns:

Type Description
BioCDocument

BioC document completely populated with passages.

Source code in autocorpus/bioc_formatter.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def get_formatted_bioc_document(
    main_text: dict[str, Any],
    file_path: Path,
) -> BioCDocument:  # TODO: Change return type to ac_bioc.BioCDocument
    """Constructs the BioC document template using the provided data store.

    The title is turned into the first passage. Every entry of
    ``main_text["paragraphs"]`` is then assigned a running ``offset`` and
    converted into a passage. The running offset grows by the length of each
    paragraph body, plus the length of each section/subsection heading the
    first time that heading text is encountered (the title counts as already
    encountered).

    Note:
        Each paragraph dict in ``main_text["paragraphs"]`` is modified in
        place: its ``"offset"`` key is set before conversion.

    Args:
        main_text: Input document-level data. Must provide ``"title"`` and
            ``"paragraphs"``; each paragraph must provide ``"body"``,
            ``"subsection_heading"`` and ``"section_heading"``.
        file_path: Path to the input file.

    Returns:
        BioC document completely populated with passages.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    title = main_text["title"]
    passages = [BioCPassage().from_title(title, 0)]

    # Heading texts whose length has already been added to the offset.
    # A set gives O(1) membership tests; headings are assumed to be strings
    # (they are measured with len() below) and therefore hashable.
    seen_headings = {title}
    offset = len(title)  # the first paragraph starts right after the title

    for passage in main_text["paragraphs"]:
        passage["offset"] = offset
        passages.append(BioCPassage().from_ac_dict(passage))
        offset += len(passage["body"])
        # Subsection is checked before section so that, when both carry the
        # same text, it is only counted once.
        for heading_key in ("subsection_heading", "section_heading"):
            heading = passage[heading_key]
            if heading not in seen_headings:
                offset += len(heading)
                seen_headings.add(heading)

    return BioCDocument(
        # Document id is the file name up to its first dot.
        id=file_path.name.split(".")[0],
        inputfile=str(file_path),
        passages=passages,
    )