Skip to content

spreadsheet

autocorpus.spreadsheet ¤

Module for extracting and converting spreadsheet content into BioC tables.

Attributes¤

Classes¤

Functions¤

convert_datetime_to_string(df) ¤

Convert all datetime objects in a DataFrame to string format.

Parameters:

Name Type Description Default
df DataFrame

The input DataFrame.

required

Returns:

Name Type Description
DataFrame DataFrame

A DataFrame with datetime columns converted to string.

Source code in autocorpus/spreadsheet.py
13
14
15
16
17
18
19
20
21
22
23
24
25
def convert_datetime_to_string(df: DataFrame) -> DataFrame:
    """Convert all datetime columns in a DataFrame to string format.

    Missing timestamps (``NaT``) are written out as empty strings rather than
    as the literal text ``"NaT"``.

    Note:
        The conversion is applied to ``df`` itself; the same object is returned
        so the call can be used in an assignment.

    Args:
        df: The input DataFrame.

    Returns:
        DataFrame: The input DataFrame with datetime columns converted to string.
    """
    for col in df.select_dtypes(include=["datetime64[ns]", "datetime64"]):
        # Record the missing entries *before* casting: astype(str) can render
        # NaT as the string "NaT", after which fillna() no longer sees it.
        missing = df[col].isna()
        df[col] = df[col].astype(str).mask(missing, "")
    return df

extract_spreadsheet_content(filename) ¤

Process an Excel file and extract each sheet as a separate table.

Parameters:

Name Type Description Default
filename Path

The path of the Excel file to be processed.

required

Returns:

Type Description
BioCTableCollection | None

A BioC table collection built from the sheets of the Excel file, or None if no tables could be extracted.

Raises:

Type Description
Exception

If there is an error while processing the Excel file.

Source code in autocorpus/spreadsheet.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def extract_spreadsheet_content(filename: Path) -> BioCTableCollection | None:
    """Process an Excel file and extract each sheet as a separate table.

    Every sheet is read into a DataFrame, datetime columns are converted to
    strings, missing values are replaced with empty strings and all cells are
    cast to ``str``. The resulting tables are then passed to
    ``BioCTableConverter.build_bioc``.

    Args:
        filename: The path of the Excel file to be processed.

    Returns:
        The collection built from the extracted tables, or ``None`` if no
        table was extracted.

    Note:
        An ``ImportError`` raised while reading the workbook (a missing Pandas
        dependency) is logged and not re-raised. Any other exception raised
        while processing the file propagates to the caller.
    """
    tables: list[DataFrame] = []
    tables_bioc: BioCTableCollection | None = None
    try:
        # Open the workbook once and close it deterministically; every sheet is
        # parsed from this handle instead of reopening the file per sheet.
        with pd.ExcelFile(filename) as xls:
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name)
                df = convert_datetime_to_string(df)

                # Replace NaNs with empty string, then convert everything to
                # string for consistency and compatibility.
                df = df.where(pd.notnull(df), "").astype(str)
                tables.append(df)
    except ImportError as ie:
        logger.error(
            msg=f"Failed to process the file {filename.name} due to the following missing Pandas dependency: {ie}"
        )

    # Sheets read before a failure are still converted; with no tables at all
    # the function returns None.
    if tables:
        tables_bioc = BioCTableConverter.build_bioc(tables, str(filename))

    return tables_bioc