This module provides functionality to extract text and tables from Word documents (.doc and .docx).
It includes methods to handle older .doc files by converting them to .docx format and processing them.
Attributes
Classes
Functions
__convert_older_doc_file(file, output_dir)
Converts an older .doc file to .docx format using platform-specific methods.
Source code in autocorpus/word.py
144
145
146
147
148
149
150
151
152
153
def __convert_older_doc_file(file: Path, output_dir: Path) -> Path | None:
    """Convert a legacy .doc file to .docx with the converter for this platform.

    Args:
        file: Path of the .doc file to convert.
        output_dir: Directory in which the .docx file is to be created.

    Returns:
        Whatever the selected platform converter returns: the path of the
        .docx file, or None when the conversion failed.
    """
    # The converted file keeps the source file's name, with a .docx suffix.
    target = output_dir / file.with_suffix(".docx").name
    system_name = platform.system()

    if system_name == "Windows":
        return __windows_convert_doc_to_docx(target, file)
    if system_name == "Darwin":  # macOS
        return __macos_convert_doc_to_docx(target, file)
    # Every other platform falls back to the Linux method.
    return __linux_convert_doc_to_docx(target, file)
|
__extract_tables(doc)
Extracts tables from a .docx document as a list of DataFrames.
Parameters:
Name |
Type |
Description |
Default |
doc
|
Document
|
The Document object representing the .docx document.
|
required
|
Returns:
Type |
Description |
list[DataFrame]
|
A list of pandas DataFrames, each representing a table in the document.
|
Example
from docx import Document
doc = Document("document.docx")
tables = __extract_tables(doc)
Source code in autocorpus/word.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def __extract_tables(doc: DocumentObject) -> list[DataFrame]:
    """Collect every table of a .docx document as a DataFrame.

    Args:
        doc: The Document object representing the .docx document.

    Returns:
        One DataFrame per table in ``doc.tables``, in document order. Each
        DataFrame row holds the stripped text of the cells of one table row.

    Example:
        from docx import Document
        doc = Document("document.docx")
        tables = __extract_tables(doc)
    """
    return [
        DataFrame(
            [[cell.text.strip() for cell in row.cells] for row in table.rows]
        )
        for table in doc.tables
    ]
|
__linux_convert_doc_to_docx(docx_path, file)
Converts a .doc file to .docx format using LibreOffice on Linux.
Source code in autocorpus/word.py
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
def __linux_convert_doc_to_docx(docx_path: Path, file: Path) -> Path | None:
    """Convert a .doc file to .docx format using LibreOffice on Linux.

    Args:
        docx_path: Expected location of the converted file. Only its parent
            directory is handed to LibreOffice (as ``--outdir``).
        file: The .doc file to convert.

    Returns:
        ``docx_path`` if LibreOffice ran successfully and the file exists
        afterwards; None if ``soffice`` is missing, exits with an error, or
        does not produce the expected file.
    """
    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "docx",
        "--outdir",
        str(docx_path.parent),
        str(file),
    ]
    try:
        result = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        logger.error(
            "LibreOffice ('soffice') not found. Please install it to enable DOC to DOCX conversion."
        )
        return None
    except subprocess.CalledProcessError as e:
        logger.exception(f"LibreOffice failed to convert '{file}': {e.stderr}")
        return None

    logger.info(f"LibreOffice output: {result.stdout}")

    # A zero exit status alone does not prove the output file was written, so
    # confirm it exists before handing the path back to the caller.
    if not docx_path.exists():
        logger.error(
            f"LibreOffice did not produce '{docx_path}' for '{file}': {result.stderr}"
        )
        return None
    return docx_path
|
__macos_convert_doc_to_docx(docx_path, file)
Converts a .doc file to .docx format using AppleScript on macOS.
Source code in autocorpus/word.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def __macos_convert_doc_to_docx(docx_path: Path, file: Path) -> Path | None:
    """Convert a .doc file to .docx by scripting Microsoft Word via AppleScript.

    Args:
        docx_path: Destination path for the converted .docx file.
        file: The .doc file to convert.

    Returns:
        ``docx_path`` when ``osascript`` exits successfully; None when
        ``osascript`` cannot be found or exits with an error.
    """
    try:
        # The script opens the source file, saves it under the target name in
        # document format, then closes it without saving again.
        script_lines = [
            "",
            'tell application "Microsoft Word"',
            f'open "{__escape_applescript_path(file)}"',
            f'save as active document file name "{__escape_applescript_path(docx_path)}" file format format document',
            "close active document saving no",
            "end tell",
            "",
        ]
        applescript = "\n".join(script_lines)
        subprocess.run(["osascript", "-e", applescript], check=True)
    except FileNotFoundError:
        logger.error(
            "osascript not found. Ensure you have AppleScript and Microsoft Word installed on macOS."
        )
        return None
    except subprocess.CalledProcessError as e:
        logger.exception(f"AppleScript failed to convert '{file}': {e}")
        return None

    logger.info(
        f"Successfully converted '{file}' to '{docx_path}' using Word on macOS."
    )
    return docx_path
|
__windows_convert_doc_to_docx(docx_path, file)
Converts a .doc file to .docx format using Microsoft Word on Windows.
Source code in autocorpus/word.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __windows_convert_doc_to_docx(docx_path: Path, file: Path) -> Path | None:
    """Convert a .doc file to .docx format using Microsoft Word on Windows.

    Word is driven through COM via ``win32com`` (pywin32), which is imported
    lazily so that this module can still be imported on other platforms.

    Args:
        docx_path: Destination path for the converted .docx file.
        file: The .doc file to convert.

    Returns:
        ``docx_path`` on success; None if pywin32 is not installed or any
        step of the conversion fails.
    """
    try:
        import win32com.client
    except ImportError:
        logger.error(
            "pywin32 is required to convert Word documents on Windows. Please install it via 'pip install pywin32'."
        )
        return None

    wd_format_document_default = 16  # wdFormatDocumentDefault (.docx)
    word = None
    try:
        word = win32com.client.DispatchEx("Word.Application")
        # Word runs as a separate process with its own working directory, so
        # relative paths must be made absolute against *this* process's
        # working directory before they are handed over.
        doc = word.Documents.Open(str(file.absolute()))
        doc.SaveAs(str(docx_path.absolute()), wd_format_document_default)
        doc.Close()
        logger.info(
            f"Successfully converted '{file}' to '{docx_path}' using Word on Windows."
        )
        return docx_path
    except Exception as e:
        logger.exception(f"Failed to convert '{file}' on Windows: {e}")
        return None
    finally:
        # Always shut down the Word instance started above, even on failure.
        if word:
            try:
                word.Quit()
            except Exception as quit_err:
                logger.warning(f"Could not quit Word application cleanly: {quit_err}")
|
extract_word_content(file_path)
Extracts text from a .doc file by converting it to .docx and processing with python-docx.
Source code in autocorpus/word.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def extract_word_content(file_path: Path):
    """Extract text and tables from a Word document (.doc or .docx).

    A .doc file is first converted to a temporary .docx file next to the
    original; a .docx file is read directly. The document is then parsed with
    python-docx and its paragraphs and tables are turned into BioC
    collections.

    Args:
        file_path: Path to the .doc or .docx file.

    Returns:
        A tuple ``(bioc_text, bioc_tables)``. ``bioc_text`` is None when the
        document has no paragraphs and ``bioc_tables`` is None when it has no
        tables. Returns None (after logging the error) if the file could not
        be converted or processed.

    Raises:
        ValueError: If the file suffix is neither .doc nor .docx.
    """
    suffix = file_path.suffix.lower()
    if suffix not in (".doc", ".docx"):
        raise ValueError("Input file must be a .doc or .docx file.")

    # Set only when a temporary .docx was produced by conversion. It is the
    # only file this function may delete; a .docx supplied by the caller must
    # never be removed.
    converted_path: Path | None = None
    try:
        if suffix == ".doc":
            output_dir = Path(file_path).parent.absolute()
            converted_path = __convert_older_doc_file(file_path, output_dir)
            if converted_path is None:
                logger.error(
                    f"Error processing file {file_path}: conversion to .docx failed"
                )
                return None
            docx_path = converted_path
        else:
            docx_path = file_path

        doc = Document(str(docx_path))
        tables = __extract_tables(doc)

        # Distinct style font sizes used by the document's paragraphs.
        text_sizes = {
            int(x.style.font.size)
            for x in doc.paragraphs
            if x.style and x.style.font.size
        }
        smallest_size = min(text_sizes) if text_sizes else None
        # The second WordText argument is True for paragraphs whose style
        # font size is larger than the smallest size found in the document.
        paragraphs = [
            WordText(
                x.text,
                bool(
                    smallest_size is not None
                    and x.style
                    and x.style.font.size
                    and int(x.style.font.size) > smallest_size
                ),
            )
            for x in doc.paragraphs
        ]

        bioc_text: BioCCollection | None = None
        bioc_tables: BioCTableCollection | None = None
        if paragraphs:
            bioc_text = BioCTextConverter.build_bioc(paragraphs, str(file_path), "word")
        if tables:
            bioc_tables = BioCTableConverter.build_bioc(tables, str(file_path))
        return bioc_text, bioc_tables
    except FileNotFoundError:
        logger.error(
            "LibreOffice 'soffice' command not found. Ensure it is installed and in your PATH."
        )
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {e}")
    finally:
        # Remove the temporary converted file whether or not processing
        # succeeded, so failed runs do not leave stray .docx files behind.
        if converted_path is not None:
            try:
                converted_path.unlink(missing_ok=True)
            except OSError as cleanup_err:
                logger.warning(
                    f"Could not remove temporary file {converted_path}: {cleanup_err}"
                )
    return None
|