Skip to content

file_processing

autocorpus.file_processing ¤

Module providing functions for processing files with Auto-CORPus.

Attributes¤

Classes¤

Functions¤

process_directory(config, dir_path) ¤

Process all files in a directory and its subdirectories.

Parameters:

Name Type Description Default
config dict[str, Any]

Configuration dictionary for the input HTML journal articles

required
dir_path Path

Path to the directory containing files to be processed.

required

Returns:

Type Description
Iterable[Autocorpus]

A generator yielding Autocorpus objects for each processed file.

Source code in autocorpus/file_processing.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def process_directory(config: dict[str, Any], dir_path: Path) -> Iterable[Autocorpus]:
    """Process all files in a directory and its subdirectories.

    Files directly inside ``dir_path`` are processed as they are encountered;
    each subdirectory is searched recursively for files.

    Args:
        config: Configuration dictionary for the input HTML journal articles
        dir_path: Path to the directory containing files to be processed.

    Yields:
        An Autocorpus object for each processed file.
    """
    for entry in dir_path.iterdir():
        if entry.is_file():
            yield process_file(config, entry)

        elif entry.is_dir():
            # rglob("*") returns nested directories as well as files; only
            # regular files may be handed to process_file.
            for nested_path in entry.rglob("*"):
                if nested_path.is_file():
                    yield process_file(config, nested_path)

process_file(config, file_path, linked_tables=[]) ¤

Process the input file based on its type.

This method checks the file type and processes the file accordingly.

Parameters:

Name Type Description Default
config dict[str, Any]

Configuration dictionary for the input journal articles

required
file_path Path

Path to the article file to be processed

required
linked_tables list[Path]

list of linked table file paths to be included in this run (HTML files only)

[]

Raises:

Type Description
NotImplementedError

For file types with no implemented processing.

ModuleNotFoundError

For PDF processing if required packages are not found.

Source code in autocorpus/file_processing.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def process_file(
    config: dict[str, Any], file_path: Path, linked_tables: list[Path] = []
) -> Autocorpus:
    """Process the input file based on its type.

    This method checks the file type and processes the file accordingly.

    Args:
        config: Configuration dictionary for the input journal articles
        file_path: Path to the article file to be processed
        linked_tables: list of linked table file paths to be included in this run
            (HTML files only)

    Raises:
        NotImplementedError: For files types with no implemented processing.
        ModuleNotFoundError: For PDF processing if required packages are not found.
    """
    main_text: dict[str, Any] = {}
    tables_dict: dict[str, Any] = {}
    match check_file_type(file_path):
        case FileType.HTML:
            return Autocorpus(
                file_path, *process_html_article(config, file_path, linked_tables)
            )
        case FileType.XML:
            main_text = convert_xml_to_json(file_path)

            return Autocorpus(file_path, main_text, dict(), dict())
        case FileType.PDF:
            try:
                from .pdf import extract_pdf_content

                text, tables = extract_pdf_content(file_path)

                if text:
                    main_text = text.to_dict()

                if tables:
                    tables_dict = tables.to_dict()

                return Autocorpus(file_path, main_text, dict(), tables_dict)

            except ModuleNotFoundError:
                logger.error(
                    "Could not load necessary PDF packages. If you installed "
                    "Auto-CORPUS via pip, you can obtain these with:\n"
                    "    pip install autocorpus[pdf]"
                )
                raise
        case FileType.WORD:
            try:
                from .word import extract_word_content

                text, tbls = extract_word_content(file_path)

                if text:
                    main_text = text.to_dict()

                if tbls:
                    tables_dict = tbls.to_dict()

                return Autocorpus(file_path, main_text, dict(), tables_dict)
            except ModuleNotFoundError:
                logger.error(
                    "Could not load necessary Word packages. Microsoft Word is required to process Word documents on Windows & MAC OS, or alternatively LibreOffice can be used on Linux.\n"
                )
                raise
        case FileType.EXCEL:
            from .spreadsheet import extract_spreadsheet_content

            tbls = extract_spreadsheet_content(file_path)

            if tbls:
                tables_dict = tbls.to_dict()

            return Autocorpus(file_path, dict(), dict(), tables_dict)

        case FileType.UNKNOWN:
            raise NotImplementedError(f"Could not identify file type for {file_path}")

process_files(config, files) ¤

Process all files in a list.

Parameters:

Name Type Description Default
config dict[str, Any]

Configuration dictionary for the input HTML journal articles

required
files list[Path]

list of Paths to the files to be processed.

required

Returns:

Type Description
Iterable[Autocorpus]

A generator yielding Autocorpus objects for each processed file.

Raises:

Type Description
RuntimeError

If the list of files is invalid.

Source code in autocorpus/file_processing.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def process_files(config: dict[str, Any], files: list[Path]) -> Iterable[Autocorpus]:
    """Process every file in the given list.

    This is a generator function, so the validation of ``files`` runs when the
    result is first iterated, not when the function is called.

    Args:
        config: Configuration dictionary for the input HTML journal articles
        files: list of Paths to the files to be processed.

    Returns:
        A generator yielding Autocorpus objects for each processed file.

    Raises:
        RuntimeError: If the list of files is invalid.
    """
    # Reject the whole batch as soon as one entry is not an existing file.
    for candidate in files:
        if not candidate.is_file():
            raise RuntimeError("All files must be valid file paths.")

    yield from (process_file(config, candidate) for candidate in files)