Skip to content

section

autocorpus.section ¤

Handles section processing for Auto-CORPus.

Modules used: `re` (regular expression searching/replacing), `nltk` (string tokenization), and `fuzzywuzzy` (string-in-string ratio).

Attributes¤

Classes¤

Functions¤

get_iao_term_mapping(section_heading) ¤

Get the IAO term mapping for a given section heading.

Parameters:

Name Type Description Default
section_heading str

The name of the section heading.

required

Returns:

Type Description
list[dict[str, str]]

The IAO term mapping for the section heading.

Source code in autocorpus/section.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def get_iao_term_mapping(section_heading: str) -> list[dict[str, str]]:
    """Get the IAO term mapping for a given section heading.

    The heading is tokenized with NLTK, lower-cased and re-joined with single
    spaces. A heading containing " and ", "&" or "/" is split into parts and
    every part is mapped independently; any other heading is mapped as a
    whole. A heading (or part) is mapped to the first IAO term in the mapping
    file that has a known heading with a high enough fuzzy-match ratio.

    Args:
        section_heading: The name of the section heading.

    Returns:
        The IAO term mapping for the section heading. When nothing matches
        (including an empty heading) a single "document part" entry is
        returned.
    """
    mapping_dict = read_mapping_file()
    words = [w.lower() for w in nltk.wordpunct_tokenize(section_heading)]
    h2_tmp = " ".join(words)

    # Pattern used to strip a leading numbering prefix such as "1 . ".
    # NOTE(review): as written it requires a "(" or "." after the optional
    # digits and then allows one literal "]"; the "]]?" looks like a typo for
    # an optional character class - confirm the intended pattern.
    numbering_prefix = r"^\d*\s?[\(\.]]?\s?"

    # TODO: check for best match, not the first
    mapping_result = []
    if h2_tmp:
        if any(x in h2_tmp for x in [" and ", "&", "/"]):
            # Compound heading: map each part on its own and collect all hits.
            for h2_part in re.split(r" and |\s?/\s?|\s?&\s?", h2_tmp):
                h2_part = re.sub(numbering_prefix, "", h2_part)
                for IAO_term, heading_list in mapping_dict.items():
                    if any(
                        fuzz.ratio(h2_part, heading) >= 80 for heading in heading_list
                    ):
                        mapping_result.append(get_iao_term_to_id_mapping(IAO_term))
                        break

        else:
            # Strip the numbering prefix once, before the loop, so that every
            # IAO term is compared against the same normalized heading.
            h2_tmp = re.sub(numbering_prefix, "", h2_tmp)
            for IAO_term, heading_list in mapping_dict.items():
                # NOTE(review): this branch uses a strict "> 80" while the
                # compound branch uses ">= 80" - confirm which is intended.
                if any(fuzz.ratio(h2_tmp, heading) > 80 for heading in heading_list):
                    mapping_result = [get_iao_term_to_id_mapping(IAO_term)]
                    break

    if not mapping_result:
        return [{"iao_name": "document part", "iao_id": "IAO:0000314"}]

    return mapping_result

get_iao_term_to_id_mapping(iao_term) ¤

Map IAO terms to IAO IDs.

Parameters:

Name Type Description Default
iao_term str

IAO term to map to an IAO ID.

required

Returns:

Type Description
dict[str, str]

A dictionary containing the IAO term and its corresponding ID

Source code in autocorpus/section.py
120
121
122
123
124
125
126
127
128
129
130
131
def get_iao_term_to_id_mapping(iao_term: str) -> dict[str, str]:
    """Pair an IAO term with its IAO ID.

    Args:
        iao_term: IAO term to look up in the parsed term-to-ID file.

    Returns:
        A dictionary with the term under "iao_name" and its ID under
        "iao_id"; the ID is an empty string when the term is not listed.
    """
    term_to_id = read_iao_term_to_id_file()
    return {"iao_name": iao_term, "iao_id": term_to_id.get(iao_term, "")}

get_section(config, section_dict) ¤

Identifies a section using the provided configuration.

Parameters:

Name Type Description Default
config dict[str, dict[str, Any]]

AC configuration object.

required
section_dict dict[str, Any]

Article section dictionary.

required
Source code in autocorpus/section.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
def get_section(
    config: dict[str, dict[str, Any]], section_dict: dict[str, Any]
) -> Iterable[Paragraph]:
    """Identifies a section using the provided configuration.

    The first entry of the section's "headers" is used as the section heading
    and mapped to IAO terms. Sections headed "Abbreviations" (when an
    "abbreviations-table" config entry exists) and sections mapped to the IAO
    "references section" term are processed by dedicated helpers; every other
    section is split into paragraphs via ``_get_section``.

    Args:
        config: AC configuration object.
        section_dict: Article section dictionary. Its "node" entry is passed
            to the extraction helpers; "headers" is optional.

    Yields:
        The paragraphs extracted from the section.
    """
    # `or [""]` also covers a "headers" entry that is present but empty, which
    # would otherwise raise IndexError instead of giving an empty heading.
    headers = section_dict.get("headers") or [""]
    section_heading = headers[0]
    section_type = get_iao_term_mapping(section_heading)

    # Different processing for abbreviations and references section types
    if section_heading == "Abbreviations":
        # Without an abbreviations config the section falls through to the
        # checks below and is processed like any other section.
        if abbreviations_config := config.get("abbreviations-table", None):
            abbreviations = _get_abbreviations(
                abbreviations_config, section_dict["node"]
            )
            for body in abbreviations:
                yield Paragraph(section_heading, "", body, section_type)
            return

    if {
        "iao_name": "references section",
        "iao_id": "IAO:0000320",
    } in section_type:
        yield from _get_references(config, section_heading, section_dict["node"])
        return

    for child in _get_section(config, section_dict["node"]):
        yield Paragraph(
            section_heading,
            child.subheading,
            child.body,
            section_type,
        )

read_iao_term_to_id_file() cached ¤

Parses the IAO_term_to_ID.txt file.

Returns:

Type Description
dict[str, str]

Parsed IAO ids as a dictionary

Source code in autocorpus/section.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
@lru_cache
def read_iao_term_to_id_file() -> dict[str, str]:
    """Load IAO_term_to_ID.txt into a term-to-ID lookup.

    Each line is split at its first tab: the text before it is the IAO term
    and the text after it is the ID. The result is cached by ``lru_cache``.

    Returns:
        Parsed IAO ids as a dictionary
    """
    id_path = resources.files("autocorpus.IAO_dicts") / "IAO_term_to_ID.txt"
    term_ids = {}
    with id_path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            term, _, term_id = raw_line.rstrip("\n").partition("\t")
            term_ids[term] = term_id
    return term_ids

read_mapping_file() cached ¤

Reads the IAO mapping file and parses it into a dictionary.

Returns:

Type Description
dict[str, list[str]]

The parsed IAO mappings

Source code in autocorpus/section.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
@lru_cache
def read_mapping_file() -> dict[str, list[str]]:
    """Reads the IAO mapping file and parses it into a dictionary.

    Every line of IAO_FINAL_MAPPING.txt is lower-cased and split at its first
    tab into a heading and an IAO term field. Lines with an empty term field
    are skipped. A term field containing "/" names several IAO terms; the
    heading is recorded under each of them (not just the first two). The
    result is cached by ``lru_cache``.

    Returns:
        The parsed IAO mappings, keyed by IAO term with the list of headings
        mapped to that term as value.
    """
    mapping_dict: dict[str, list[str]] = {}
    mapping_path = resources.files("autocorpus.IAO_dicts") / "IAO_FINAL_MAPPING.txt"
    with mapping_path.open(encoding="utf-8") as f:
        for line in f:
            heading, _, iao_term = line.rstrip("\n").lower().partition("\t")
            if not iao_term:
                continue

            if "/" in iao_term:
                # Surrounding spaces are only trimmed for "/"-separated terms;
                # a single term is used exactly as it appears in the file.
                iao_terms = [part.strip(" ") for part in iao_term.split("/")]
            else:
                iao_terms = [iao_term]

            for term in iao_terms:
                mapping_dict.setdefault(term, []).append(heading)
    return mapping_dict