Skip to content

utils

autocorpus.utils ¤

Utility script containing various functions used throughout AC in different use-cases.

Functions¤

config_anchors(value) ¤

Clean the regex anchors of an AC config rule.

Parameters:

Name Type Description Default
value str

AC config anchor value

required

Returns:

Type Description
str

Cleaned regex with missing ^ and $ characters added.

Source code in autocorpus/utils.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def config_anchors(value):
    """Anchor an AC config regex at both ends.

    Args:
        value (str): AC config anchor value

    Returns:
        (str): The regex with a leading ^ and a trailing $ added wherever
            either one was missing.
    """
    head = "" if value.startswith("^") else "^"
    anchored = f"{head}{value}"
    # The suffix test runs on the already-prefixed string, so "" becomes "^$".
    tail = "" if anchored.endswith("$") else "$"
    return f"{anchored}{tail}"

config_attr_block(block) ¤

Parse the attributes block of an AC config file.

Parameters:

Name Type Description Default
block dict

attributes block of an AC config file

required

Returns:

Type Description

(dict) regex compiled & cleaned attributes block

Source code in autocorpus/utils.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def config_attr_block(block):
    """Compile every regex rule in one attributes block of an AC config file.

    Args:
        block (dict): attributes block of an AC config file

    Returns:
        (dict) the same keys mapped to anchored, compiled regexes: a single
            pattern for a string rule, a list of patterns for a list rule.
            Keys whose value is neither a string nor a list are left out.
    """
    compiled = {}
    for name in block:
        rule = block[name]
        if isinstance(rule, str):
            compiled[name] = re.compile(config_anchors(rule))
        elif isinstance(rule, list):
            compiled[name] = [re.compile(config_anchors(entry)) for entry in rule]
    return compiled

config_attrs(attrs) ¤

Clean and compile attributes block of an AC config file.

Parameters:

Name Type Description Default
attrs list of dicts or dict

attributes block of an AC config file

required

Returns:

Type Description
list

cleaned and compiled attributes block of an AC config file

Source code in autocorpus/utils.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def config_attrs(attrs):
    """Clean and compile attributes block of an AC config file.

    Args:
        attrs (list of dicts or dict): attributes block of an AC config file

    Returns:
        (dict): compiled attributes keyed by attribute name. For a single dict
            this is exactly the result of config_attr_block(). For a list of
            dicts the compiled blocks are merged into one dict in which every
            key maps to a list holding the compiled patterns collected for
            that key across all blocks. An empty list yields an empty dict.

    Raises:
        SystemExit: if attrs is neither a dict nor a list, or if a list item
            is not a dict.
    """
    if isinstance(attrs, dict):
        return config_attr_block(attrs)
    if not isinstance(attrs, list):
        quit(f"{attrs} must be a dict or a list of dicts")

    merged = {}
    for attr in attrs:
        if not isinstance(attr, dict):
            quit(f"{attrs} must be a dict or a list of dicts")
        # Extending a list with the compiled block would keep only its key
        # names and lose every pattern, so merge key by key instead.
        for key, compiled in config_attr_block(attr).items():
            patterns = compiled if isinstance(compiled, list) else [compiled]
            merged.setdefault(key, []).extend(patterns)
    return merged

config_tags(tags) ¤

Parse the tags block of an AC config file.

Parameters:

Name Type Description Default
tags list or str

tags block of an AC config file

required

Returns:

Type Description
list

cleaned and compiled tags block of an AC config file

Source code in autocorpus/utils.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def config_tags(tags):
    """Compile the tags block of an AC config file into anchored regexes.

    Args:
        tags (list or str): tags block of an AC config file

    Returns:
        (list): one compiled, anchored regex per tag; a lone string gives a
            one-element list.
    """
    # Treat a single string as a one-item list so both shapes share one loop.
    candidates = [tags] if isinstance(tags, str) else tags
    if not isinstance(candidates, list):
        quit(f"{tags} must be a string or list of strings")

    compiled = []
    for candidate in candidates:
        if not isinstance(candidate, str):
            quit(f"{tags} must be a string or list of strings")
        compiled.append(re.compile(config_anchors(candidate)))
    return compiled

get_data_element_node(config, soup) ¤

Retrieve the matches for the data element node config rules.

Parameters:

Name Type Description Default
config dict

Parsed config rules to be used

required
soup BeautifulSoup

BeautifulSoup object containing the input text to search

required

Returns:

Type Description
list

Matches for the data element node config rules

Source code in autocorpus/utils.py
248
249
250
251
252
253
254
255
256
257
258
259
def get_data_element_node(config, soup):
    """Find the nodes matching the data element node config rules.

    The rules are wrapped under a 'defined-by' key and handed to
    handle_defined_by().

    Args:
        config (dict): Parsed config rules to be used
        soup (bs4.BeautifulSoup): BeautifulSoup object containing the input text to search

    Returns:
        (list): Matches for the data element node config rules
    """
    return handle_defined_by({"defined-by": config}, soup)

get_files(base_dir, pattern='(.*).html') ¤

Recursively retrieve all PMC.html files from the directory.

Parameters:

Name Type Description Default
base_dir

base directory

required
pattern

regular expression matched against each file's resolved path (default '(.*).html')

'(.*).html'
Returns:

file_list: a list of file paths

Source code in autocorpus/utils.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def get_files(base_dir, pattern=r"(.*).html"):
    """Recursively collect the files under a directory whose path matches a regex.

    Args:
        base_dir: base directory
        pattern: regular expression applied with re.match() to each file's
            resolved path (default ``(.*).html``)

    Return:
        file_list: a list of resolved file paths, as strings

    """
    found = []
    for entry in Path(base_dir).iterdir():
        resolved = entry.resolve()
        resolved_str = str(resolved)
        if resolved.is_file():
            if re.match(pattern, resolved_str):
                found.append(resolved_str)
            continue
        # Descend into sub-directories, except Jupyter checkpoint folders.
        if resolved.is_dir() and "ipynb_checkpoints" not in resolved_str:
            found.extend(get_files(resolved, pattern))
    return found

handle_defined_by(config, soup) ¤

Retrieve matching nodes for the 'defined-by' config rules.

Parameters:

Name Type Description Default
config dict[str, Any]

config file section used to parse

required
soup BeautifulSoup

soup section to parse

required

Returns:

Type Description
list[Tag]

A list of objects, each object being a matching node. Object of the form: { node: bs4Object, data: { key: [values] } }. node is a bs4 object of a single result derived from bs4.find_all(). data is an object where the results from the config "data" sections are housed. The key is the name of the data section and the values are all matches found within any of the main matches which match the current data section definition. The values are the responses you get from get_text() on any found nodes, not the nodes themselves.

Source code in autocorpus/utils.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def handle_defined_by(config: dict[str, Any], soup: BeautifulSoup) -> list[Tag]:
    """Retrieve matching nodes for the 'defined-by' config rules.

    Every definition under ``config["defined-by"]`` is parsed with
    parse_configs(). Its tag/attribute rules are applied through
    ``soup.find_all()``; its XPath rules are evaluated against a re-parse of
    ``str(soup)`` and each hit is converted back into BeautifulSoup nodes.

    Args:
        config: config file section used to parse; must contain 'defined-by'
        soup: soup section to parse

    Returns:
        The matching nodes, in the order they were found. Matches without any
        text are dropped, and a match whose get_text() equals that of an
        earlier match (across all definitions) is dropped as a duplicate.

    Raises:
        SystemExit: if config does not contain the 'defined-by' key.
    """
    if "defined-by" not in config:
        quit(f"{config} does not contain the required 'defined-by' key.")
    matches = []
    seen_text = set()
    # The lxml tree is only needed for XPath rules; build it lazily, once.
    xpath_root = None
    for definition in config["defined-by"]:
        bs_attrs = parse_configs(definition)
        new_matches = []  # type: ignore[var-annotated]
        if bs_attrs["name"] or bs_attrs["attrs"]:
            new_matches = [
                found
                for found in soup.find_all(
                    bs_attrs["name"] or None,
                    bs_attrs["attrs"] or None,
                )
                if found.text
            ]
        # A single XPath expression and a list of them share one code path.
        xpaths = bs_attrs.get("xpath", [])
        if not isinstance(xpaths, list):
            xpaths = [xpaths]
        for path in xpaths:
            if xpath_root is None:
                xpath_root = fromstring(str(soup))
            xpath_matches = xpath_root.xpath(path)
            if not xpath_matches:
                continue
            for xpath_match in xpath_matches:
                fragment = bs4.BeautifulSoup(
                    etree.tostring(xpath_match, encoding="unicode", method="html"),
                    "html.parser",
                )
                if fragment.text.strip():
                    # Iterating the parsed fragment yields its top-level
                    # children, which is what gets queued as matches.
                    new_matches.extend(fragment)
        for match in new_matches:
            # Bare strings queued by the XPath branch are not nodes; skip them
            # explicitly rather than comparing against the text of whichever
            # match was handled before (unbound if there was none).
            if type(match) is NavigableString:
                continue
            matched_text = match.get_text()
            if matched_text in seen_text:
                continue
            seen_text.add(matched_text)
            matches.append(match)
    return matches

handle_not_tables(config, soup) ¤

Executes a search on non-table bs4 soup objects based on provided config rules.

Parameters:

Name Type Description Default
config dict[str, Any]

Parsed config rules to be used

required
soup BeautifulSoup

BeautifulSoup object containing the input text to search

required

Returns:

Type Description
list[dict[str, Tag | list[str]]]

A list of matches for the provided config rules. Either as a Tag or a list of strings.

Source code in autocorpus/utils.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def handle_not_tables(
    config: dict[str, Any],
    soup: BeautifulSoup,
) -> list[dict[str, Tag | list[str]]]:
    """Executes a search on non-table bs4 soup objects based on provided config rules.

    The nodes are located with handle_defined_by(). When the config has a
    "data" section, each of its entries is searched for inside every node and
    the text of the hits is collected under that entry's name.

    Args:
        config: Parsed config rules to be used
        soup: BeautifulSoup object containing the input text to search

    Returns:
        One dict per matching node. "node" holds the node itself; every "data"
        entry that matched inside the node adds a key holding the list of
        distinct get_text() values found by any of its definitions, in
        discovery order. Entries that matched nothing add no key.
    """
    matches = handle_defined_by(config, soup)
    if "data" not in config:
        return [{"node": match} for match in matches]

    responses = []
    for match in matches:
        response_addition: dict[str, Tag | list[str]] = {"node": match}
        for ele in config["data"]:
            seen_text = set()
            texts: list[str] = []
            found_any = False
            for definition in config["data"][ele]:
                bs_attrs = parse_configs(definition)
                new_matches = match.find_all(
                    bs_attrs["name"] or None,
                    bs_attrs["attrs"] or None,
                )
                if new_matches:
                    found_any = True
                for new_match in new_matches:
                    text = new_match.get_text()
                    if text in seen_text:
                        continue
                    seen_text.add(text)
                    texts.append(text)
            # Assign once per entry so a later definition cannot wipe out the
            # texts gathered by an earlier one.
            if found_any:
                response_addition[ele] = texts
        responses.append(response_addition)
    return responses

handle_tables(config, soup) ¤

Parse the provided BeautifulSoup object containing tables using the provided config rules.

Parameters:

Name Type Description Default
config dict

Parsed config rules to be used

required
soup BeautifulSoup

BeautifulSoup object containing the input tables to construct

required

Returns:

Type Description
list

List of matches for the provided config rules

Source code in autocorpus/utils.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
def handle_tables(config, soup):
    """Parse the provided BeautifulSoup object containing tables using the provided config rules.

    The table nodes are located with handle_defined_by(). When the config has
    a "data" section, its "caption", "title" and "footer" entries are searched
    for inside every table node; other entries are ignored.

    Args:
        config (dict): Parsed config rules to be used
        soup (bs4.BeautifulSoup): BeautifulSoup object containing the input tables to construct

    Returns:
        (list): One dict per matching node. Without a "data" section each dict
            only has "node". With one, each dict has "node", "title", "footer"
            and "caption"; a text key stays "" when nothing matched and
            otherwise holds the list of cleaned strings found by any of its
            definitions, with hits whose get_text() was already seen for that
            key left out.
    """
    matches = handle_defined_by(config, soup)
    if "data" not in config:
        return [{"node": match} for match in matches]

    text_data = ("caption", "title", "footer")
    responses = []
    for match in matches:
        response_addition = {
            "node": match,
            "title": "",
            "footer": "",
            "caption": "",
        }
        for ele in config["data"]:
            if ele not in text_data:
                continue
            seen_text = set()
            values = []
            found_any = False
            for definition in config["data"][ele]:
                bs_attrs = parse_configs(definition)
                new_matches = match.find_all(
                    bs_attrs["name"] or None,
                    bs_attrs["attrs"] or None,
                )
                if new_matches:
                    found_any = True
                for new_match in new_matches:
                    raw_text = new_match.get_text()
                    if raw_text in seen_text:
                        continue
                    # Record the text so the duplicate check above can fire.
                    seen_text.add(raw_text)
                    value = "".join(
                        navigate_contents(item) for item in new_match.contents
                    )

                    # clean the cell
                    value = value.strip().replace("\u2009", " ")
                    value = re.sub("<\\/?span[^>\n]*>?|<hr\\/>?", "", value)
                    value = re.sub("\\n", "", value)
                    values.append(value)
            # Assign once per entry so a later definition cannot wipe out the
            # values gathered by an earlier one.
            if found_any:
                response_addition[ele] = values
        responses.append(response_addition)
    return responses

navigate_contents(item) ¤

Extract nested text recursively from the provided NavigableString/Tag item.

Parameters:

Name Type Description Default
item NavigableString or Tag

Root element/tag to extract nested text from.

required

Returns:

Type Description

(str) Text nested within the provided item.

Source code in autocorpus/utils.py
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
def navigate_contents(item):
    """Recursively flatten a NavigableString/Tag item into text.

    Strings are NFKD-normalised. ``sup`` and ``sub`` tags are kept as literal
    markup around their flattened contents; any other tag contributes only
    its flattened contents.

    Args:
        item (bs4.element.NavigableString or bs4.element.Tag): Root element/tag to extract nested text from.

    Returns:
        (str) Text nested within the provided item; an empty string when the
            item is neither a NavigableString nor a Tag.
    """
    if isinstance(item, bs4.element.NavigableString):
        return unicodedata.normalize("NFKD", item)
    if not isinstance(item, bs4.element.Tag):
        return ""
    inner = "".join(navigate_contents(child) for child in item.contents)
    if item.name in ("sup", "sub"):
        return f"<{item.name}>{inner}</{item.name}>"
    return inner

parse_configs(definition) ¤

Parse a top-level block of an AC config file.

Parameters:

Name Type Description Default
definition dict

top-level block of an AC config file.

required

Returns:

Type Description
dict

cleaned and compiled block of an AC config file.

Source code in autocorpus/utils.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def parse_configs(definition):
    """Turn a top-level block of an AC config file into search arguments.

    Args:
        definition (dict): top-level block of an AC config file.

    Returns:
        (dict): always has the keys "name", "attrs" and "xpath". "name" and
            "attrs" hold the compiled "tag" and "attrs" rules, "xpath" holds
            the "xpath" value unchanged; a key whose rule is absent from the
            definition holds an empty list.
    """
    return {
        "name": config_tags(definition["tag"]) if "tag" in definition else [],
        "attrs": config_attrs(definition["attrs"]) if "attrs" in definition else [],
        "xpath": definition["xpath"] if "xpath" in definition else [],
    }