Skip to content

word

autocorpus.word ¤

This module provides functionality to extract text and tables from Word documents (.doc and .docx).

It includes methods to handle older .doc files by converting them to .docx format and processing them.

Attributes¤

Classes¤

Functions¤

__convert_older_doc_file(file, output_dir) ¤

Converts an older .doc file to .docx format using platform-specific methods.

Source code in autocorpus/word.py
144
145
146
147
148
149
150
151
152
153
154
def __convert_older_doc_file(file: Path, output_dir: Path) -> Path | None:
    """Convert a legacy .doc file to .docx with the converter for the current OS.

    Args:
        file: Path of the .doc file to convert.
        output_dir: Directory in which the .docx file is to be written.

    Returns:
        Whatever the selected platform converter returns: the path of the
        .docx file, or None if the conversion failed.
    """
    # The target keeps the source file's name, with the suffix swapped to .docx.
    target = output_dir / file.with_suffix(".docx").name

    system_name = platform.system()
    if system_name == "Windows":
        converter = __windows_convert_doc_to_docx
    elif system_name == "Darwin":  # macOS
        converter = __macos_convert_doc_to_docx
    else:
        # Every other platform falls back to the LibreOffice-based method.
        converter = __linux_convert_doc_to_docx
    return converter(target, file)

__extract_tables(doc) ¤

Extracts tables from a .docx document as a list of DataFrames.

Parameters:

Name Type Description Default
doc Document

The Document object representing the .docx document.

required

Returns:

Type Description
list[DataFrame]

List[pd.DataFrame]: A list of pandas DataFrames, each representing a table in the document.

Example

from docx import Document

doc = Document("document.docx")
tables = __extract_tables(doc)

Source code in autocorpus/word.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def __extract_tables(doc: DocumentObject) -> list[DataFrame]:
    """Collect every table of a .docx document as a pandas DataFrame.

    Each table row becomes one DataFrame row, and the text of every cell is
    stripped of leading and trailing whitespace. No row is promoted to a
    header, so the resulting frames use the default integer column labels.

    Args:
        doc: The Document object representing the .docx document.

    Returns:
        One DataFrame per table, in the order the tables appear in
        ``doc.tables``. The list is empty when the document has no tables.

    Example:
        from docx import Document

        doc = Document("document.docx")
        tables = __extract_tables(doc)
    """
    return [
        DataFrame([[cell.text.strip() for cell in row.cells] for row in table.rows])
        for table in doc.tables
    ]

__linux_convert_doc_to_docx(docx_path, file) ¤

Converts a .doc file to .docx format using LibreOffice on Linux.

Source code in autocorpus/word.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def __linux_convert_doc_to_docx(docx_path: Path, file: Path) -> Path | None:
    """Converts a .doc file to .docx format using LibreOffice on Linux.

    Runs ``soffice --headless --convert-to docx`` with ``docx_path.parent`` as
    the output directory. Only the directory is passed to LibreOffice, so
    ``docx_path`` is expected to be the input file's name with a ``.docx``
    suffix inside that directory.

    Args:
        docx_path: Expected location of the converted .docx file.
        file: The .doc file to convert.

    Returns:
        ``docx_path`` if the conversion produced the file; None if ``soffice``
        is not installed, exits with an error, or leaves no output file. Every
        failure is logged.
    """
    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "docx",
        "--outdir",
        str(docx_path.parent),
        str(file),
    ]
    try:
        result = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        logger.error(
            "LibreOffice ('soffice') not found. Please install it to enable DOC to DOCX conversion."
        )
        return None
    except subprocess.CalledProcessError as e:
        logger.exception(f"LibreOffice failed to convert '{file}': {e.stderr}")
        return None

    logger.info(f"LibreOffice output: {result.stdout}")

    # A zero exit status alone does not prove the file was written (soffice has
    # been observed to exit 0 on unreadable input), so verify the output
    # before handing its path to the caller.
    if not docx_path.exists():
        logger.error(
            f"LibreOffice did not produce '{docx_path}' for '{file}': {result.stderr}"
        )
        return None
    return docx_path

__macos_convert_doc_to_docx(docx_path, file) ¤

Converts a .doc file to .docx format using AppleScript on macOS.

Source code in autocorpus/word.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def __macos_convert_doc_to_docx(docx_path: Path, file: Path) -> Path | None:
    """Convert a .doc file to .docx by scripting Microsoft Word via AppleScript.

    Args:
        docx_path: Path under which Word is asked to save the .docx file.
        file: The .doc file Word is asked to open.

    Returns:
        ``docx_path`` when ``osascript`` exits successfully; None when
        ``osascript`` cannot be found or exits with a non-zero status (both
        cases are logged).
    """
    try:
        # Both paths go through __escape_applescript_path (defined elsewhere in
        # this module) before being embedded in the script's string literals.
        source_literal = __escape_applescript_path(file)
        target_literal = __escape_applescript_path(docx_path)
        script = f'''
        tell application "Microsoft Word"
            open "{source_literal}"
            save as active document file name "{target_literal}" file format format document
            close active document saving no
        end tell
        '''
        command = ["osascript", "-e", script]
        # check=True turns a non-zero osascript exit into CalledProcessError.
        subprocess.run(command, check=True)
        logger.info(
            f"Successfully converted '{file}' to '{docx_path}' using Word on macOS."
        )
        return docx_path
    except subprocess.CalledProcessError as e:
        logger.exception(f"AppleScript failed to convert '{file}': {e}")
        return None
    except FileNotFoundError:
        logger.error(
            "osascript not found. Ensure you have AppleScript and Microsoft Word installed on macOS."
        )
        return None

__windows_convert_doc_to_docx(docx_path, file) ¤

Converts a .doc file to .docx format using Microsoft Word on Windows.

Source code in autocorpus/word.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def __windows_convert_doc_to_docx(docx_path: Path, file: Path) -> Path | None:
    """Converts a .doc file to .docx format using Microsoft Word on Windows.

    Drives a dedicated Word instance through COM (pywin32), opens ``file``,
    saves it as .docx at ``docx_path`` and always tries to quit Word
    afterwards.

    Args:
        docx_path: Path under which the converted .docx file is saved.
        file: The .doc file to convert.

    Returns:
        ``docx_path`` on success; None if pywin32 is not installed or any step
        of the conversion fails (failures are logged).
    """
    try:
        import win32com.client
    except ImportError:
        logger.error(
            "pywin32 is required to convert Word documents on Windows. Please install it via 'pip install pywin32'."
        )
        return None

    # Word resolves relative paths against its own working directory rather
    # than this process's, so always hand it absolute paths.
    source = str(Path(file).absolute())
    target = str(Path(docx_path).absolute())

    word = None
    try:
        # DispatchEx starts a separate Word process for this conversion.
        word = win32com.client.DispatchEx("Word.Application")
        doc = word.Documents.Open(source)
        doc.SaveAs(target, 16)  # 16 = wdFormatDocumentDefault (.docx)
        doc.Close()
        logger.info(
            f"Successfully converted '{file}' to '{docx_path}' using Word on Windows."
        )
        return docx_path
    except Exception as e:
        logger.exception(f"Failed to convert '{file}' on Windows: {e}")
        return None
    finally:
        # Quit Word on both success and failure so no process is left behind.
        if word:
            try:
                word.Quit()
            except Exception as quit_err:
                logger.warning(f"Could not quit Word application cleanly: {quit_err}")

extract_word_content(file_path) ¤

Extracts text and tables from a Word document (.doc or .docx). A .doc file is first converted to .docx; the resulting document is then processed with python-docx.

Source code in autocorpus/word.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def extract_word_content(file_path: Path):
    """Extracts text and tables from a Word document as BioC collections.

    A ``.docx`` file is read directly. A legacy ``.doc`` file is first
    converted to a temporary ``.docx`` next to the input; that temporary file
    is removed again whether or not extraction succeeds. The input file itself
    is never deleted.

    Args:
        file_path: Path to the ``.doc`` or ``.docx`` file.

    Returns:
        A ``(bioc_text, bioc_tables)`` tuple. ``bioc_text`` is None when the
        document has no paragraphs and ``bioc_tables`` is None when it has no
        tables. Returns None (after logging the error) when the conversion or
        the extraction fails.

    Raises:
        ValueError: If ``file_path`` does not have a ``.doc`` or ``.docx``
            suffix.
    """
    suffix = file_path.suffix.lower()
    if suffix not in (".doc", ".docx"):
        raise ValueError("Input file must be a .doc or .docx file.")

    # Only set when this function created a temporary .docx that it must clean up.
    converted_path: Path | None = None
    try:
        if suffix == ".doc":
            # NOTE(review): the converted file is written next to the input
            # under the same stem, so an existing sibling .docx of that name
            # would be overwritten and then removed — confirm this is acceptable.
            output_dir = Path(file_path).parent.absolute()
            converted_path = __convert_older_doc_file(file_path, output_dir)
            if converted_path is None:
                logger.error(
                    f"Could not convert {file_path} to .docx; skipping extraction."
                )
                return None
            docx_path = converted_path
        else:
            # Already a .docx: read it in place.
            docx_path = file_path

        doc = Document(str(docx_path))
        tables = __extract_tables(doc)

        # Distinct font sizes declared by the paragraph styles in the document.
        text_sizes = {
            int(x.style.font.size)
            for x in doc.paragraphs
            if x.style and x.style.font.size
        }
        smallest_size = min(text_sizes) if text_sizes else None

        # The flag passed to WordText is True when the paragraph's style font
        # size exceeds the smallest size found in the document (presumably
        # marking headings — confirm against WordText).
        paragraphs = [
            WordText(
                x.text,
                bool(
                    smallest_size is not None
                    and x.style
                    and x.style.font.size
                    and int(x.style.font.size) > smallest_size
                ),
            )
            for x in doc.paragraphs
        ]
        bioc_text: BioCCollection | None = None
        bioc_tables: BioCTableCollection | None = None

        if paragraphs:
            bioc_text = BioCTextConverter.build_bioc(paragraphs, str(file_path), "word")

        if tables:
            bioc_tables = BioCTableConverter.build_bioc(tables, str(file_path))

        return bioc_text, bioc_tables
    except FileNotFoundError as e:
        logger.error(f"File not found while processing {file_path}: {e}")
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {e}")
    finally:
        # Remove only the temporary conversion output, never the user's input.
        if converted_path is not None:
            try:
                Path(converted_path).unlink(missing_ok=True)
            except OSError as cleanup_err:
                logger.warning(
                    f"Could not remove temporary file {converted_path}: {cleanup_err}"
                )
    return None