Skip to content

pdf

autocorpus.pdf ¤

Functionality for processing PDF files.

Attributes¤

Classes¤

Functions¤

extract_pdf_content(file_path) ¤

Extracts content from a PDF file.

Parameters:

Name Type Description Default
file_path Path

Path to the PDF file.

required

Returns:

Type Description
BioCCollection

The text extracted from the PDF, as built by BioCTextConverter.

BioCTableCollection

The tables extracted from the PDF, as built by BioCTableConverter.

Raises:

Type Description
RuntimeError

If the PDF converter is not initialized.

Source code in autocorpus/pdf.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def extract_pdf_content(
    file_path: Path,
) -> tuple[BioCCollection, BioCTableCollection]:
    """Extracts the text and tables of a PDF file as BioC collections.

    The PDF is rendered with the module's PDF converter, the rendered text is
    split into body text and tables, and each part is handed to its BioC
    builder.

    Args:
        file_path (Path): Path to the PDF file.

    Returns:
        A ``(text, tables)`` tuple: the collection produced by
        ``BioCTextConverter.build_bioc`` for the extracted text, and the
        collection produced by ``BioCTableConverter.build_bioc`` for the
        extracted tables.

    Raises:
        RuntimeError: If the PDF converter is not initialized.
    """
    pdf_converter = _get_pdf_converter()
    if not pdf_converter:
        message = "PDF converter not initialized."
        logger.error(message)
        raise RuntimeError(message)

    # The converter and both BioC builders are given the path as a string,
    # so convert it once up front.
    source = str(file_path)

    # Render the PDF; only the text element of the returned triple is used.
    rendered = pdf_converter(source)
    text, _, _ = text_from_rendered(rendered)

    # Separate the tables from the surrounding text.
    text, tables = _extract_table_from_pdf_text(text)

    # Format both parts for BioC.
    bioc_text = BioCTextConverter.build_bioc(text, source, "pdf")
    bioc_tables = BioCTableConverter.build_bioc(tables, source)

    return bioc_text, bioc_tables