Case study: heat map

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Remote location of the zipped Wisconsin breast cancer data set.
DATA_URL = (
    'https://raw.githubusercontent.com/health-data-science-OR/'
    'hpdm139-datasets/main/wisconsin.zip'
)
def load_wisconsin(path):
    '''
    Load and clean the wisconsin breast cancer
    dataset.

    The 'Unnamed: 0' and 'id' columns are discarded and the
    'diagnosis' label is recoded from 'M'/'B' to 1/0.

    Params:
    -------
    path: str or file-like
        Location of the CSV data; passed straight to pd.read_csv.

    Returns:
    -------
    tuple (pd.DataFrame, pd.Series)
        The feature columns (everything except 'diagnosis', in their
        original order) and the diagnosis labels stored as np.byte.
    '''
    label_column = 'diagnosis'
    # String codes keep the column as text until the explicit cast below.
    recoded_label = {'M': '1', 'B': '0'}
    to_drop = ['Unnamed: 0', 'id']

    df = pd.read_csv(path).drop(to_drop, axis=1)
    y = df[label_column].replace(recoded_label).astype(np.byte)

    # Select the features by name rather than by position so the label can
    # never end up in X if the column order of the file changes.
    X = df.drop(columns=label_column)
    return X, y
X, y = load_wisconsin(DATA_URL)
# Preview the first two rows of the feature matrix.
X.head(2)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 17.99 10.38 122.8 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 ... 25.38 17.33 184.6 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.9 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 ... 24.99 23.41 158.8 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902

2 rows × 30 columns

Visualising correlations with a heat map

def correlation_heat_map(correlation_matrix, cmap='inferno_r', figsize=(8, 8)):
    '''
    Draw a correlation matrix as a labelled heat map with a colour bar.

    Params:
    -------
    correlation_matrix: pd.DataFrame,
        n x n matrix with feature names. Column names label the x axis
        and index values label the y axis.

    cmap: str, optional (default='inferno_r')
        The colour mapping for the heatmap

    figsize: tuple (int, int), optional (default=(8, 8))
        Size of figure.

    Returns:
    -------
    tuple (figure, image)
        The figure that was created and the image returned by imshow.
    '''
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot()
    im = ax.imshow(correlation_matrix, cmap=cmap)

    # One tick per cell so that every feature name is shown. The tick
    # count on each axis is taken from the same source as its labels
    # (columns for x, index for y) so the two always agree in length.
    ax.set_xticks(ticks=np.arange(len(correlation_matrix.columns)))
    ax.set_yticks(ticks=np.arange(len(correlation_matrix.index)))

    # add the feature names and rotate x axis 90 degrees.
    ax.set_xticklabels(list(correlation_matrix.columns), rotation=90)
    ax.set_yticklabels(list(correlation_matrix.index))

    # add colour coding legend
    cbar = fig.colorbar(im, ax=ax)
    cbar.ax.set_ylabel('Correlation', rotation=-90, va="bottom")

    return fig, im
# Pairwise correlations between the feature columns, shown as a heat map.
correlation_matrix = X.corr()
fig, im = correlation_heat_map(correlation_matrix)
../../../_images/4c447cbfe72cf887af76bbd93f4a4aaa64963ccaa9e871797cc31627a10448ca.png