excel.py 25.0 KB

原文件审查历史永久链接

"""
Utilities for conversion to writer-agnostic Excel representation.
"""

from functools import reduce
import itertools
import re
from typing import Callable, Dict, Optional, Sequence, Union
import warnings

import numpy as np

from pandas._typing import Label

from pandas.core.dtypes import missing
from pandas.core.dtypes.common import is_float, is_scalar
from pandas.core.dtypes.generic import ABCIndex

from pandas import DataFrame, Index, MultiIndex, PeriodIndex
import pandas.core.common as com

from pandas.io.common import stringify_path
from pandas.io.formats.css import CSSResolver, CSSWarning
from pandas.io.formats.format import get_level_lengths
from pandas.io.formats.printing import pprint_thing


class ExcelCell:
    __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend")
    __slots__ = __fields__

    def __init__(
        self, row: int, col: int, val, style=None, mergestart=None, mergeend=None
    ):
        self.row = row
        self.col = col
        self.val = val
        self.style = style
        self.mergestart = mergestart
        self.mergeend = mergeend


class CSSToExcelConverter:
    """
    A callable for converting CSS declarations to ExcelWriter styles

    Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
    focusing on font styling, backgrounds, borders and alignment.

    Operates by first computing CSS styles in a fairly generic
    way (see :meth:`compute_css`) then determining Excel style
    properties from CSS properties (see :meth:`build_xlstyle`).

    Parameters
    ----------
    inherited : str, optional
        CSS declarations understood to be the containing scope for the
        CSS processed by :meth:`__call__`.
    """

    # NB: Most of the methods here could be classmethods, as only __init__
    #     and __call__ make use of instance attributes.  We leave them as
    #     instancemethods so that users can easily experiment with extensions
    #     without monkey-patching.

    def __init__(self, inherited: Optional[str] = None):
        if inherited is not None:
            inherited = self.compute_css(inherited)

        self.inherited = inherited

    compute_css = CSSResolver()

    def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]:
        """
        Convert CSS declarations to ExcelWriter style.

        Parameters
        ----------
        declarations_str : str
            List of CSS declarations.
            e.g. "font-weight: bold; background: blue"

        Returns
        -------
        xlstyle : dict
            A style as interpreted by ExcelWriter when found in
            ExcelCell.style.
        """
        # TODO: memoize?
        properties = self.compute_css(declarations_str, self.inherited)
        return self.build_xlstyle(properties)

    def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        out = {
            "alignment": self.build_alignment(props),
            "border": self.build_border(props),
            "fill": self.build_fill(props),
            "font": self.build_font(props),
            "number_format": self.build_number_format(props),
        }

        # TODO: handle cell width and height: needs support in pandas.io.excel

        def remove_none(d: Dict[str, str]) -> None:
            """Remove key where value is None, through nested dicts"""
            for k, v in list(d.items()):
                if v is None:
                    del d[k]
                elif isinstance(v, dict):
                    remove_none(v)
                    if not v:
                        del d[k]

        remove_none(out)
        return out

    VERTICAL_MAP = {
        "top": "top",
        "text-top": "top",
        "middle": "center",
        "baseline": "bottom",
        "bottom": "bottom",
        "text-bottom": "bottom",
        # OpenXML also has 'justify', 'distributed'
    }

    def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]:
        # TODO: text-indent, padding-left -> alignment.indent
        return {
            "horizontal": props.get("text-align"),
            "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")),
            "wrap_text": (
                None
                if props.get("white-space") is None
                else props["white-space"] not in ("nowrap", "pre", "pre-line")
            ),
        }

    def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]:
        return {
            side: {
                "style": self._border_style(
                    props.get(f"border-{side}-style"),
                    props.get(f"border-{side}-width"),
                ),
                "color": self.color_to_excel(props.get(f"border-{side}-color")),
            }
            for side in ["top", "right", "bottom", "left"]
        }

    def _border_style(self, style: Optional[str], width):
        # convert styles and widths to openxml, one of:
        #       'dashDot'
        #       'dashDotDot'
        #       'dashed'
        #       'dotted'
        #       'double'
        #       'hair'
        #       'medium'
        #       'mediumDashDot'
        #       'mediumDashDotDot'
        #       'mediumDashed'
        #       'slantDashDot'
        #       'thick'
        #       'thin'
        if width is None and style is None:
            return None
        if style == "none" or style == "hidden":
            return None

        if width is None:
            width = "2pt"
        width = float(width[:-2])
        if width < 1e-5:
            return None
        elif width < 1.3:
            width_name = "thin"
        elif width < 2.8:
            width_name = "medium"
        else:
            width_name = "thick"

        if style in (None, "groove", "ridge", "inset", "outset"):
            # not handled
            style = "solid"

        if style == "double":
            return "double"
        if style == "solid":
            return width_name
        if style == "dotted":
            if width_name in ("hair", "thin"):
                return "dotted"
            return "mediumDashDotDot"
        if style == "dashed":
            if width_name in ("hair", "thin"):
                return "dashed"
            return "mediumDashed"

    def build_fill(self, props: Dict[str, str]):
        # TODO: perhaps allow for special properties
        #       -excel-pattern-bgcolor and -excel-pattern-type
        fill_color = props.get("background-color")
        if fill_color not in (None, "transparent", "none"):
            return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"}

    BOLD_MAP = {
        "bold": True,
        "bolder": True,
        "600": True,
        "700": True,
        "800": True,
        "900": True,
        "normal": False,
        "lighter": False,
        "100": False,
        "200": False,
        "300": False,
        "400": False,
        "500": False,
    }
    ITALIC_MAP = {"normal": False, "italic": True, "oblique": True}

    def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]:
        size = props.get("font-size")
        if size is not None:
            assert size.endswith("pt")
            size = float(size[:-2])

        font_names_tmp = re.findall(
            r"""(?x)
            (
            "(?:[^"]|\\")+"
            |
            '(?:[^']|\\')+'
            |
            [^'",]+
            )(?=,|\s*$)
        """,
            props.get("font-family", ""),
        )
        font_names = []
        for name in font_names_tmp:
            if name[:1] == '"':
                name = name[1:-1].replace('\\"', '"')
            elif name[:1] == "'":
                name = name[1:-1].replace("\\'", "'")
            else:
                name = name.strip()
            if name:
                font_names.append(name)

        family = None
        for name in font_names:
            if name == "serif":
                family = 1  # roman
                break
            elif name == "sans-serif":
                family = 2  # swiss
                break
            elif name == "cursive":
                family = 4  # script
                break
            elif name == "fantasy":
                family = 5  # decorative
                break

        decoration = props.get("text-decoration")
        if decoration is not None:
            decoration = decoration.split()
        else:
            decoration = ()

        return {
            "name": font_names[0] if font_names else None,
            "family": family,
            "size": size,
            "bold": self.BOLD_MAP.get(props.get("font-weight")),
            "italic": self.ITALIC_MAP.get(props.get("font-style")),
            "underline": ("single" if "underline" in decoration else None),
            "strike": ("line-through" in decoration) or None,
            "color": self.color_to_excel(props.get("color")),
            # shadow if nonzero digit before shadow color
            "shadow": (
                bool(re.search("^[^#(]*[1-9]", props["text-shadow"]))
                if "text-shadow" in props
                else None
            ),
            # FIXME: dont leave commented-out
            # 'vertAlign':,
            # 'charset': ,
            # 'scheme': ,
            # 'outline': ,
            # 'condense': ,
        }

    NAMED_COLORS = {
        "maroon": "800000",
        "brown": "A52A2A",
        "red": "FF0000",
        "pink": "FFC0CB",
        "orange": "FFA500",
        "yellow": "FFFF00",
        "olive": "808000",
        "green": "008000",
        "purple": "800080",
        "fuchsia": "FF00FF",
        "lime": "00FF00",
        "teal": "008080",
        "aqua": "00FFFF",
        "blue": "0000FF",
        "navy": "000080",
        "black": "000000",
        "gray": "808080",
        "grey": "808080",
        "silver": "C0C0C0",
        "white": "FFFFFF",
    }

    def color_to_excel(self, val: Optional[str]):
        if val is None:
            return None
        if val.startswith("#") and len(val) == 7:
            return val[1:].upper()
        if val.startswith("#") and len(val) == 4:
            return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
        try:
            return self.NAMED_COLORS[val]
        except KeyError:
            warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning)

    def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]:
        return {"format_code": props.get("number-format")}


class ExcelFormatter:
    """
    Class for formatting a DataFrame to a list of ExcelCells,

    Parameters
    ----------
    df : DataFrame or Styler
    na_rep: na representation
    float_format : string, default None
            Format string for floating point numbers
    cols : sequence, optional
        Columns to write
    header : boolean or list of string, default True
        Write out column names. If a list of string is given it is
        assumed to be aliases for the column names
    index : boolean, default True
        output row names (index)
    index_label : string or sequence, default None
            Column label for index column(s) if desired. If None is given, and
            `header` and `index` are True, then the index names are used. A
            sequence should be given if the DataFrame uses MultiIndex.
    merge_cells : boolean, default False
            Format MultiIndex and Hierarchical Rows as merged cells.
    inf_rep : string, default `'inf'`
        representation for np.inf values (which aren't representable in Excel)
        A `'-'` sign will be added in front of -inf.
    style_converter : callable, optional
        This translates Styler styles (CSS) into ExcelWriter styles.
        Defaults to ``CSSToExcelConverter()``.
        It should have signature css_declarations string -> excel style.
        This is only called for body cells.
    """

    max_rows = 2 ** 20
    max_cols = 2 ** 14

    def __init__(
        self,
        df,
        na_rep: str = "",
        float_format: Optional[str] = None,
        cols: Optional[Sequence[Label]] = None,
        header: Union[Sequence[Label], bool] = True,
        index: bool = True,
        index_label: Optional[Union[Label, Sequence[Label]]] = None,
        merge_cells: bool = False,
        inf_rep: str = "inf",
        style_converter: Optional[Callable] = None,
    ):
        self.rowcounter = 0
        self.na_rep = na_rep
        if not isinstance(df, DataFrame):
            self.styler = df
            df = df.data
            if style_converter is None:
                style_converter = CSSToExcelConverter()
            self.style_converter = style_converter
        else:
            self.styler = None
        self.df = df
        if cols is not None:

            # all missing, raise
            if not len(Index(cols) & df.columns):
                raise KeyError("passes columns are not ALL present dataframe")

            if len(Index(cols) & df.columns) != len(cols):
                # Deprecated in GH#17295, enforced in 1.0.0
                raise KeyError("Not all names specified in 'columns' are found")

            self.df = df.reindex(columns=cols)

        self.columns = self.df.columns
        self.float_format = float_format
        self.index = index
        self.index_label = index_label
        self.header = header
        self.merge_cells = merge_cells
        self.inf_rep = inf_rep

    @property
    def header_style(self):
        return {
            "font": {"bold": True},
            "borders": {
                "top": "thin",
                "right": "thin",
                "bottom": "thin",
                "left": "thin",
            },
            "alignment": {"horizontal": "center", "vertical": "top"},
        }

    def _format_value(self, val):
        if is_scalar(val) and missing.isna(val):
            val = self.na_rep
        elif is_float(val):
            if missing.isposinf_scalar(val):
                val = self.inf_rep
            elif missing.isneginf_scalar(val):
                val = f"-{self.inf_rep}"
            elif self.float_format is not None:
                val = float(self.float_format % val)
        if getattr(val, "tzinfo", None) is not None:
            raise ValueError(
                "Excel does not support datetimes with "
                "timezones. Please ensure that datetimes "
                "are timezone unaware before writing to Excel."
            )
        return val

    def _format_header_mi(self):
        if self.columns.nlevels > 1:
            if not self.index:
                raise NotImplementedError(
                    "Writing to Excel with MultiIndex columns and no "
                    "index ('index'=False) is not yet implemented."
                )

        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
        if not (has_aliases or self.header):
            return

        columns = self.columns
        level_strs = columns.format(
            sparsify=self.merge_cells, adjoin=False, names=False
        )
        level_lengths = get_level_lengths(level_strs)
        coloffset = 0
        lnum = 0

        if self.index and isinstance(self.df.index, MultiIndex):
            coloffset = len(self.df.index[0]) - 1

        if self.merge_cells:
            # Format multi-index as a merged cells.
            for lnum in range(len(level_lengths)):
                name = columns.names[lnum]
                yield ExcelCell(lnum, coloffset, name, self.header_style)

            for lnum, (spans, levels, level_codes) in enumerate(
                zip(level_lengths, columns.levels, columns.codes)
            ):
                values = levels.take(level_codes)
                for i in spans:
                    if spans[i] > 1:
                        yield ExcelCell(
                            lnum,
                            coloffset + i + 1,
                            values[i],
                            self.header_style,
                            lnum,
                            coloffset + i + spans[i],
                        )
                    else:
                        yield ExcelCell(
                            lnum, coloffset + i + 1, values[i], self.header_style
                        )
        else:
            # Format in legacy format with dots to indicate levels.
            for i, values in enumerate(zip(*level_strs)):
                v = ".".join(map(pprint_thing, values))
                yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style)

        self.rowcounter = lnum

    def _format_header_regular(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
        if has_aliases or self.header:
            coloffset = 0

            if self.index:
                coloffset = 1
                if isinstance(self.df.index, MultiIndex):
                    coloffset = len(self.df.index[0])

            colnames = self.columns
            if has_aliases:
                if len(self.header) != len(self.columns):
                    raise ValueError(
                        f"Writing {len(self.columns)} cols but got {len(self.header)} "
                        "aliases"
                    )
                else:
                    colnames = self.header

            for colindex, colname in enumerate(colnames):
                yield ExcelCell(
                    self.rowcounter, colindex + coloffset, colname, self.header_style
                )

    def _format_header(self):
        if isinstance(self.columns, MultiIndex):
            gen = self._format_header_mi()
        else:
            gen = self._format_header_regular()

        gen2 = ()
        if self.df.index.names:
            row = [x if x is not None else "" for x in self.df.index.names] + [
                ""
            ] * len(self.columns)
            if reduce(lambda x, y: x and y, map(lambda x: x != "", row)):
                gen2 = (
                    ExcelCell(self.rowcounter, colindex, val, self.header_style)
                    for colindex, val in enumerate(row)
                )
                self.rowcounter += 1
        return itertools.chain(gen, gen2)

    def _format_body(self):
        if isinstance(self.df.index, MultiIndex):
            return self._format_hierarchical_rows()
        else:
            return self._format_regular_rows()

    def _format_regular_rows(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
        if has_aliases or self.header:
            self.rowcounter += 1

        # output index and index_label?
        if self.index:
            # check aliases
            # if list only take first as this is not a MultiIndex
            if self.index_label and isinstance(
                self.index_label, (list, tuple, np.ndarray, Index)
            ):
                index_label = self.index_label[0]
            # if string good to go
            elif self.index_label and isinstance(self.index_label, str):
                index_label = self.index_label
            else:
                index_label = self.df.index.names[0]

            if isinstance(self.columns, MultiIndex):
                self.rowcounter += 1

            if index_label and self.header is not False:
                yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style)

            # write index_values
            index_values = self.df.index
            if isinstance(self.df.index, PeriodIndex):
                index_values = self.df.index.to_timestamp()

            for idx, idxval in enumerate(index_values):
                yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style)

            coloffset = 1
        else:
            coloffset = 0

        for cell in self._generate_body(coloffset):
            yield cell

    def _format_hierarchical_rows(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
        if has_aliases or self.header:
            self.rowcounter += 1

        gcolidx = 0

        if self.index:
            index_labels = self.df.index.names
            # check for aliases
            if self.index_label and isinstance(
                self.index_label, (list, tuple, np.ndarray, Index)
            ):
                index_labels = self.index_label

            # MultiIndex columns require an extra row
            # with index names (blank if None) for
            # unambiguous round-trip, unless not merging,
            # in which case the names all go on one row Issue #11328
            if isinstance(self.columns, MultiIndex) and self.merge_cells:
                self.rowcounter += 1

            # if index labels are not empty go ahead and dump
            if com.any_not_none(*index_labels) and self.header is not False:

                for cidx, name in enumerate(index_labels):
                    yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)

            if self.merge_cells:
                # Format hierarchical rows as merged cells.
                level_strs = self.df.index.format(
                    sparsify=True, adjoin=False, names=False
                )
                level_lengths = get_level_lengths(level_strs)

                for spans, levels, level_codes in zip(
                    level_lengths, self.df.index.levels, self.df.index.codes
                ):

                    values = levels.take(
                        level_codes, allow_fill=levels._can_hold_na, fill_value=True
                    )

                    for i in spans:
                        if spans[i] > 1:
                            yield ExcelCell(
                                self.rowcounter + i,
                                gcolidx,
                                values[i],
                                self.header_style,
                                self.rowcounter + i + spans[i] - 1,
                                gcolidx,
                            )
                        else:
                            yield ExcelCell(
                                self.rowcounter + i,
                                gcolidx,
                                values[i],
                                self.header_style,
                            )
                    gcolidx += 1

            else:
                # Format hierarchical rows with non-merged values.
                for indexcolvals in zip(*self.df.index):
                    for idx, indexcolval in enumerate(indexcolvals):
                        yield ExcelCell(
                            self.rowcounter + idx,
                            gcolidx,
                            indexcolval,
                            self.header_style,
                        )
                    gcolidx += 1

        for cell in self._generate_body(gcolidx):
            yield cell

    def _generate_body(self, coloffset: int):
        if self.styler is None:
            styles = None
        else:
            styles = self.styler._compute().ctx
            if not styles:
                styles = None
        xlstyle = None

        # Write the body of the frame data series by series.
        for colidx in range(len(self.columns)):
            series = self.df.iloc[:, colidx]
            for i, val in enumerate(series):
                if styles is not None:
                    xlstyle = self.style_converter(";".join(styles[i, colidx]))
                yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle)

    def get_formatted_cells(self):
        for cell in itertools.chain(self._format_header(), self._format_body()):
            cell.val = self._format_value(cell.val)
            yield cell

    def write(
        self,
        writer,
        sheet_name="Sheet1",
        startrow=0,
        startcol=0,
        freeze_panes=None,
        engine=None,
    ):
        """
        writer : string or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : string, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow :
            upper left cell row to dump data frame
        startcol :
            upper left cell column to dump data frame
        freeze_panes : tuple of integer (length 2), default None
            Specifies the one-based bottommost row and rightmost column that
            is to be frozen
        engine : string, default None
            write engine to use if writer is a path - you can also set this
            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
            and ``io.excel.xlsm.writer``.
        """
        from pandas.io.excel import ExcelWriter

        num_rows, num_cols = self.df.shape
        if num_rows > self.max_rows or num_cols > self.max_cols:
            raise ValueError(
                f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} "
                f"Max sheet size is: {self.max_rows}, {self.max_cols}"
            )

        if isinstance(writer, ExcelWriter):
            need_save = False
        else:
            writer = ExcelWriter(stringify_path(writer), engine=engine)
            need_save = True

        formatted_cells = self.get_formatted_cells()
        writer.write_cells(
            formatted_cells,
            sheet_name,
            startrow=startrow,
            startcol=startcol,
            freeze_panes=freeze_panes,
        )
        if need_save:
            writer.save()