Source code for statsmodels.graphics.mosaicplot

"""Create a mosaic plot from a contingency table.

It allows to visualize multivariate categorical data in a rigorous
and informative way.

"""
# Author: Enrico Giampieri - 21 Jan 2013

from statsmodels.compat.python import (iteritems, iterkeys, lrange, string_types, lzip,
itervalues, zip, range)
import numpy as np
from collections import OrderedDict
from itertools import product

from numpy import iterable, r_, cumsum, array
from statsmodels.graphics import utils
from pandas import DataFrame

__all__ = ["mosaic"]

def _normalize_split(proportion):
"""
return a list of proportions of the available space given the division
if only a number is given, it will assume a split in two pieces
"""
if not iterable(proportion):
if proportion == 0:
proportion = array([0.0, 1.0])
elif proportion >= 1:
proportion = array([1.0, 0.0])
elif proportion < 0:
raise ValueError("proportions should be positive,"
"given value: {}".format(proportion))
else:
proportion = array([proportion, 1.0 - proportion])
proportion = np.asarray(proportion, dtype=float)
if np.any(proportion < 0):
raise ValueError("proportions should be positive,"
"given value: {}".format(proportion))
if np.allclose(proportion, 0):
raise ValueError("at least one proportion should be "
"greater than zero".format(proportion))
# ok, data are meaningful, so go on
if len(proportion) < 2:
return array([0.0, 1.0])
left = r_[0, cumsum(proportion)]
left /= left[-1] * 1.0
return left

def _split_rect(x, y, width, height, proportion, horizontal=True, gap=0.05):
"""
Split the given rectangle in n segments whose proportion is specified
along the given axis if a gap is inserted, they will be separated by a
certain amount of space, retaining the relative proportion between them
a gap of 1 correspond to a plot that is half void and the remaining half
space is proportionally divided among the pieces.
"""
x, y, w, h = float(x), float(y), float(width), float(height)
if (w < 0) or (h < 0):
raise ValueError("dimension of the square less than"
"zero w={} h=()".format(w, h))
proportions = _normalize_split(proportion)

# extract the starting point and the dimension of each subdivision
# in respect to the unit square
starting = proportions[:-1]
amplitude = proportions[1:] - starting

# how much each extrema is going to be displaced due to gaps
starting += gap * np.arange(len(proportions) - 1)

# how much the squares plus the gaps are extended
extension = starting[-1] + amplitude[-1] - starting[0]

# normalize everything for fit again in the original dimension
starting /= extension
amplitude /= extension

# bring everything to the original square
starting = (x if horizontal else y) + starting * (w if horizontal else h)
amplitude = amplitude * (w if horizontal else h)

# create each 4-tuple for each new block
results = [(s, y, a, h) if horizontal else (x, s, w, a)
for s, a in zip(starting, amplitude)]
return results

def _reduce_dict(count_dict, partial_key):
"""
Make partial sum on a counter dict.
Given a match for the beginning of the category, it will sum each value.
"""
L = len(partial_key)
count = sum(v for k, v in iteritems(count_dict) if k[:L] == partial_key)
return count

def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap):
"""
Given a dictionary where each entry  is a rectangle, a list of key and
value (count of elements in each category) it split each rect accordingly,
as long as the key start with the tuple key_subset.  The other keys are
returned without modification.
"""
result = OrderedDict()
L = len(key_subset)
for name, (x, y, w, h) in iteritems(rect_dict):
if key_subset == name[:L]:
# split base on the values given
divisions = _split_rect(x, y, w, h, values, horizontal, gap)
for key, rect in zip(keys, divisions):
result[name + (key,)] = rect
else:
result[name] = (x, y, w, h)
return result

def _tuplify(obj):
"""convert an object in a tuple of strings (even if it is not iterable,
like a single integer number, but keep the string healthy)
"""
if np.iterable(obj) and not isinstance(obj, string_types):
res = tuple(str(o) for o in obj)
else:
res = (str(obj),)
return res

def _categories_level(keys):
"""use the Ordered dict to implement a simple ordered set
return each level of each category
[[key_1_level_1,key_2_level_1],[key_1_level_2,key_2_level_2]]
"""
res = []
for i in zip(*(keys)):
tuplefied = _tuplify(i)
res.append(list(OrderedDict([(j, None) for j in tuplefied])))
return res

def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
"""
Split a square in a hierarchical way given a contingency table.

Hierarchically split the unit square in alternate directions
in proportion to the subdivision contained in the contingency table
count_dict.  This is the function that actually perform the tiling
for the creation of the mosaic plot.  If the gap array has been specified
it will insert a corresponding amount of space (proportional to the
unit lenght), while retaining the proportionality of the tiles.

Parameters
----------
count_dict : dict
Dictionary containing the contingency table.
Each category should contain a non-negative number
with a tuple as index.  It expects that all the combination
of keys to be representes; if that is not true, will
automatically consider the missing values as 0
horizontal : bool
The starting direction of the split (by default along
the horizontal axis)
gap : float or array of floats
The list of gaps to be applied on each subdivision.
If the lenght of the given array is less of the number
of subcategories (or if it's a single number) it will extend
it with exponentially decreasing gaps

Returns
---------
base_rect : dict
A dictionary containing the result of the split.
To each key is associated a 4-tuple of coordinates
that are required to create the corresponding rectangle:

0 - x position of the lower left corner
1 - y position of the lower left corner
2 - width of the rectangle
3 - height of the rectangle
"""
# this is the unit square that we are going to divide
base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
# get the list of each possible value for each level
categories_levels = _categories_level(list(iterkeys(count_dict)))
L = len(categories_levels)

# recreate the gaps vector starting from an int
if not np.iterable(gap):
gap = [gap / 1.5 ** idx for idx in range(L)]
# extend if it's too short
if len(gap) < L:
last = gap[-1]
gap = list(*gap) + [last / 1.5 ** idx for idx in range(L)]
# trim if it's too long
gap = gap[:L]
# put the count dictionay in order for the keys
# this will allow some code simplification
count_ordered = OrderedDict([(k, count_dict[k])
for k in list(product(*categories_levels))])
for cat_idx, cat_enum in enumerate(categories_levels):
# get the partial key up to the actual level
base_keys = list(product(*categories_levels[:cat_idx]))
for key in base_keys:
# for each partial and each value calculate how many
# observation we have in the counting dictionary
part_count = [_reduce_dict(count_ordered, key + (partial,))
for partial in cat_enum]
# reduce the gap for subsequents levels
new_gap = gap[cat_idx]
# split the given subkeys in the rectangle dictionary
base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
horizontal, new_gap)
horizontal = not horizontal
return base_rect

def _single_hsv_to_rgb(hsv):
"""Transform a color from the hsv space to the rgb."""
from matplotlib.colors import hsv_to_rgb
return hsv_to_rgb(array(hsv).reshape(1, 1, 3)).reshape(3)

def _create_default_properties(data):
""""Create the default properties of the mosaic given the data
first it will varies the color hue (first category) then the color
saturation (second category) and then the color value
(third category).  If a fourth category is found, it will put
decoration on the rectangle.  Doesn't manage more than four
level of categories
"""
categories_levels = _categories_level(list(iterkeys(data)))
Nlevels = len(categories_levels)
# first level, the hue
L = len(categories_levels[0])
# hue = np.linspace(1.0, 0.0, L+1)[:-1]
hue = np.linspace(0.0, 1.0, L + 2)[:-2]
# second level, the saturation
L = len(categories_levels[1]) if Nlevels > 1 else 1
saturation = np.linspace(0.5, 1.0, L + 1)[:-1]
# third level, the value
L = len(categories_levels[2]) if Nlevels > 2 else 1
value = np.linspace(0.5, 1.0, L + 1)[:-1]
# fourth level, the hatch
L = len(categories_levels[3]) if Nlevels > 3 else 1
hatch = ['', '/', '-', '|', '+'][:L + 1]
# convert in list and merge with the levels
hue = lzip(list(hue), categories_levels[0])
saturation = lzip(list(saturation),
categories_levels[1] if Nlevels > 1 else [''])
value = lzip(list(value),
categories_levels[2] if Nlevels > 2 else [''])
hatch = lzip(list(hatch),
categories_levels[3] if Nlevels > 3 else [''])
# create the properties dictionary
properties = {}
for h, s, v, t in product(hue, saturation, value, hatch):
hv, hn = h
sv, sn = s
vv, vn = v
tv, tn = t
level = (hn,) + ((sn,) if sn else tuple())
level = level + ((vn,) if vn else tuple())
level = level + ((tn,) if tn else tuple())
hsv = array([hv, sv, vv])
prop = {'color': _single_hsv_to_rgb(hsv), 'hatch': tv, 'lw': 0}
properties[level] = prop
return properties

def _normalize_data(data, index):
"""normalize the data to a dict with tuples of strings as keys
right now it works with:

0 - dictionary (or equivalent mappable)
1 - pandas.Series with simple or hierarchical indexes
2 - numpy.ndarrays
3 - everything that can be converted to a numpy array
4 - pandas.DataFrame (via the _normalize_dataframe function)
"""
# if data is a dataframe we need to take a completely new road
# before coming back here. Use the hasattr to avoid importing
# pandas explicitly
if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
data = _normalize_dataframe(data, index)
index = None
# can it be used as a dictionary?
try:
items = list(iteritems(data))
except AttributeError:
# ok, I cannot use the data as a dictionary
# Try to convert it to a numpy array, or die trying
data = np.asarray(data)
temp = OrderedDict()
for idx in np.ndindex(data.shape):
name = tuple(i for i in idx)
temp[name] = data[idx]
data = temp
items = list(iteritems(data))
# make all the keys a tuple, even if simple numbers
data = OrderedDict([_tuplify(k), v] for k, v in items)
categories_levels = _categories_level(list(iterkeys(data)))
# fill the void in the counting dictionary
indexes = product(*categories_levels)
contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
data = contingency
# reorder the keys order according to the one specified by the user
# or if the index is None convert it into a simple list
# right now it doesn't do any check, but can be modified in the future
index = lrange(len(categories_levels)) if index is None else index
contingency = OrderedDict()
for key, value in iteritems(data):
new_key = tuple(key[i] for i in index)
contingency[new_key] = value
data = contingency
return data

def _normalize_dataframe(dataframe, index):
"""Take a pandas DataFrame and count the element present in the
given columns, return a hierarchical index on those columns
"""
#groupby the given keys, extract the same columns and count the element
# then collapse them with a mean
data = dataframe[index].dropna()
grouped = data.groupby(index, sort=False)
counted = grouped[index].count()
averaged = counted.mean(axis=1)
# Fill empty missing with 0, see GH5639
averaged = averaged.fillna(0.0)
return averaged

def _statistical_coloring(data):
"""evaluate colors from the indipendence properties of the matrix
It will encounter problem if one category has all zeros
"""
data = _normalize_data(data, None)
categories_levels = _categories_level(list(iterkeys(data)))
Nlevels = len(categories_levels)
total = 1.0 * sum(v for v in itervalues(data))
# count the proportion of observation
# for each level that has the given name
# at each level
levels_count = []
for level_idx in range(Nlevels):
proportion = {}
for level in categories_levels[level_idx]:
proportion[level] = 0.0
for key, value in iteritems(data):
if level == key[level_idx]:
proportion[level] += value
proportion[level] /= total
levels_count.append(proportion)
# for each key I obtain the expected value
# and it's standard deviation from a binomial distribution
# under the hipothesys of independence
expected = {}
for key, value in iteritems(data):
base = 1.0
for i, k in enumerate(key):
base *= levels_count[i][k]
expected[key] = base * total, np.sqrt(total * base * (1.0 - base))
# now we have the standard deviation of distance from the
# expected value for each tile. We create the colors from this
sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected))
props = {}
for key, dev in iteritems(sigmas):
red = 0.0 if dev < 0 else (dev / (1 + dev))
blue = 0.0 if dev > 0 else (dev / (-1 + dev))
green = (1.0 - red - blue) / 2.0
hatch = 'x' if dev > 2 else 'o' if dev < -2 else ''
props[key] = {'color': [red, green, blue], 'hatch': hatch}
return props

def _get_position(x, w, h, W):
if W == 0:
return x
return (x + w / 2.0) * w * h / W

def _create_labels(rects, horizontal, ax, rotation):
"""find the position of the label for each value of each category

right now it supports only up to the four categories

ax: the axis on which the label should be applied
rotation: the rotation list for each side
"""
categories = _categories_level(list(iterkeys(rects)))
if len(categories) > 4:
msg = ("maximum of 4 level supported for axes labeling... and 4"
"is already a lot of levels, are you sure you need them all?")
raise ValueError(msg)
labels = {}
#keep it fixed as will be used a lot of times
items = list(iteritems(rects))
vertical = not horizontal

#get the axis ticks and labels locator to put the correct values!
ax2 = ax.twinx()
ax3 = ax.twiny()
#this is the order of execution for horizontal disposition
ticks_pos = [ax.set_xticks, ax.set_yticks, ax3.set_xticks, ax2.set_yticks]
ticks_lab = [ax.set_xticklabels, ax.set_yticklabels,
ax3.set_xticklabels, ax2.set_yticklabels]
#for the vertical one, rotate it by one
if vertical:
ticks_pos = ticks_pos[1:] + ticks_pos[:1]
ticks_lab = ticks_lab[1:] + ticks_lab[:1]
#clean them
for pos, lab in zip(ticks_pos, ticks_lab):
pos([])
lab([])
#for each level, for each value in the level, take the mean of all
#the sublevel that correspond to that partial key
for level_idx, level in enumerate(categories):
#this dictionary keep the labels only for this level
level_ticks = dict()
for value in level:
#to which level it should refer to get the preceding
#values of labels? it's rather a tricky question...
#this is dependent on the side. It's a very crude management
#but I couldn't think a more general way...
if horizontal:
if level_idx == 3:
index_select = [-1, -1, -1]
else:
index_select = [+0, -1, -1]
else:
if level_idx == 3:
index_select = [+0, -1, +0]
else:
index_select = [-1, -1, -1]
#now I create the base key name and append the current value
#It will search on all the rects to find the corresponding one
#and use them to evaluate the mean position
basekey = tuple(categories[i][index_select[i]]
for i in range(level_idx))
basekey = basekey + (value,)
subset = dict((k, v) for k, v in items
if basekey == k[:level_idx + 1])
#now I extract the center of all the tiles and make a weighted
#mean of all these center on the area of the tile
#this should give me the (more or less) correct position
#of the center of the category

vals = list(itervalues(subset))
W = sum(w * h for (x, y, w, h) in vals)
x_lab = sum(_get_position(x, w, h, W) for (x, y, w, h) in vals)
y_lab = sum(_get_position(y, h, w, W) for (x, y, w, h) in vals)
#now base on the ordering, select which position to keep
#needs to be written in a more general form of 4 level are enough?
#should give also the horizontal and vertical alignment
side = (level_idx + vertical) % 4
level_ticks[value] = y_lab if side % 2 else x_lab
#now we add the labels of this level to the correct axis

ticks_pos[level_idx](list(itervalues(level_ticks)))
ticks_lab[level_idx](list(iterkeys(level_ticks)),
rotation=rotation[level_idx])
return labels

[docs]def mosaic(data, index=None, ax=None, horizontal=True, gap=0.005,
properties=lambda key: None, labelizer=None,
title='', statistic=False, axes_label=True,
label_rotation=0.0):
"""Create a mosaic plot from a contingency table.

It allows to visualize multivariate categorical data in a rigorous
and informative way.

Parameters
----------
data : dict, pandas.Series, np.ndarray, pandas.DataFrame
The contingency table that contains the data.
Each category should contain a non-negative number
with a tuple as index.  It expects that all the combination
of keys to be representes; if that is not true, will
automatically consider the missing values as 0.  The order
of the keys will be the same as the one of insertion.
If a dict of a Series (or any other dict like object)
is used, it will take the keys as labels.  If a
np.ndarray is provided, it will generate a simple
numerical labels.
index: list, optional
Gives the preferred order for the category ordering. If not specified
will default to the given order.  It doesn't support named indexes
for hierarchical Series.  If a DataFrame is provided, it expects
a list with the name of the columns.
ax : matplotlib.Axes, optional
The graph where display the mosaic. If not given, will
create a new figure
horizontal : bool, optional (default True)
The starting direction of the split (by default along
the horizontal axis)
gap : float or array of floats
The list of gaps to be applied on each subdivision.
If the lenght of the given array is less of the number
of subcategories (or if it's a single number) it will extend
it with exponentially decreasing gaps
labelizer : function (key) -> string, optional
A function that generate the text to display at the center of
each tile base on the key of that tile
properties : function (key) -> dict, optional
A function that for each tile in the mosaic take the key
of the tile and returns the dictionary of properties
of the generated Rectangle, like color, hatch or similar.
A default properties set will be provided fot the keys whose
color has not been defined, and will use color variation to help
visually separates the various categories. It should return None
to indicate that it should use the default property for the tile.
A dictionary of the properties for each key can be passed,
and it will be internally converted to the correct function
statistic: bool, optional (default False)
if true will use a crude statistical model to give colors to the plot.
If the tile has a containt that is more than 2 standard deviation
from the expected value under independence hipotesys, it will
go from green to red (for positive deviations, blue otherwise) and
will acquire an hatching when crosses the 3 sigma.
title: string, optional
The title of the axis
axes_label: boolean, optional
Show the name of each value of each category
on the axis (default) or hide them.
label_rotation: float or list of float
the rotation of the axis label (if present). If a list is given
each axis can have a different rotation

Returns
---------
fig : matplotlib.Figure
The generate figure
rects : dict
A dictionary that has the same keys of the original
dataset, that holds a reference to the coordinates of the
tile and the Rectangle that represent it

References
----------
A Brief History of the Mosaic Display
Michael Friendly, York University, Psychology Department
Journal of Computational and Graphical Statistics, 2001

Mosaic Displays for Loglinear Models.
Michael Friendly, York University, Psychology Department
Proceedings of the Statistical Graphics Section, 1992, 61-68.

Mosaic displays for multi-way contingecy tables.
Michael Friendly, York University, Psychology Department
Journal of the american statistical association
March 1994, Vol. 89, No. 425, Theory and Methods

Examples
----------
>>> import numpy as np
>>> import pandas as pd
>>> import matplotlib.pyplot as plt
>>> from statsmodels.graphics.mosaicplot import mosaic

The most simple use case is to take a dictionary and plot the result

>>> data = {'a': 10, 'b': 15, 'c': 16}
>>> mosaic(data, title='basic dictionary')
>>> plt.show()

A more useful example is given by a dictionary with multiple indices.
In this case we use a wider gap to a better visual separation of the
resulting plot

>>> data = {('a', 'b'): 1, ('a', 'c'): 2, ('d', 'b'): 3, ('d', 'c'): 4}
>>> mosaic(data, gap=0.05, title='complete dictionary')
>>> plt.show()

The same data can be given as a simple or hierarchical indexed Series

>>> rand = np.random.random
>>> from itertools import product
>>> tuples = list(product(['bar', 'baz', 'foo', 'qux'], ['one', 'two']))
>>> index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
>>> data = pd.Series(rand(8), index=index)
>>> mosaic(data, title='hierarchical index series')
>>> plt.show()

The third accepted data structureis the np array, for which a
very simple index will be created.

>>> rand = np.random.random
>>> data = 1+rand((2,2))
>>> mosaic(data, title='random non-labeled array')
>>> plt.show()

If you need to modify the labeling and the coloring you can give
a function tocreate the labels and one with the graphical properties
starting from the key tuple

>>> data = {'a': 10, 'b': 15, 'c': 16}
>>> props = lambda key: {'color': 'r' if 'a' in key else 'gray'}
>>> labelizer = lambda k: {('a',): 'first', ('b',): 'second',
...                        ('c',): 'third'}[k]
>>> mosaic(data, title='colored dictionary', properties=props,
...        labelizer=labelizer)
>>> plt.show()

Using a DataFrame as source, specifying the name of the columns of interest

>>> gender = ['male', 'male', 'male', 'female', 'female', 'female']
>>> pet = ['cat', 'dog', 'dog', 'cat', 'dog', 'cat']
>>> data = pd.DataFrame({'gender': gender, 'pet': pet})
>>> mosaic(data, ['pet', 'gender'], title='DataFrame as Source')
>>> plt.show()

.. plot :: plots/graphics_mosaicplot_mosaic.py
"""
if isinstance(data, DataFrame) and index is None:
raise ValueError("You must pass an index if data is a DataFrame."
" See examples.")

from matplotlib.patches import Rectangle
#from pylab import Rectangle
fig, ax = utils.create_mpl_ax(ax)
# normalize the data to a dict with tuple of strings as keys
data = _normalize_data(data, index)
# split the graph into different areas
rects = _hierarchical_split(data, horizontal=horizontal, gap=gap)
# if there is no specified way to create the labels
# create a default one
if labelizer is None:
labelizer = lambda k: "\n".join(k)
if statistic:
default_props = _statistical_coloring(data)
else:
default_props = _create_default_properties(data)
if isinstance(properties, dict):
color_dict = properties
properties = lambda key: color_dict.get(key, None)
for k, v in iteritems(rects):
# create each rectangle and put a label on it
x, y, w, h = v
conf = properties(k)
props = conf if conf else default_props[k]
text = labelizer(k)
Rect = Rectangle((x, y), w, h, label=text, **props)
ax.text(x + w / 2, y + h / 2, text, ha='center',
va='center', size='smaller')
#creating the labels on the axis
#o clearing it
if axes_label:
if np.iterable(label_rotation):
rotation = label_rotation
else:
rotation = [label_rotation] * 4
labels = _create_labels(rects, horizontal, ax, rotation)
else:
ax.set_xticks([])
ax.set_xticklabels([])
ax.set_yticks([])
ax.set_yticklabels([])
ax.set_title(title)
return fig, rects