3  Data Visualisation with Grammar of Graphics

  1. Open Jupyter Notebook from the command line:
jupyter notebook

To open a specific notebook, pass its filename:

jupyter notebook notebook.ipynb 
  1. Install the following packages
# Optional one-time setup: uncomment the lines below to install the required
# packages using the same Python interpreter that is running this notebook.
#import sys

#!{sys.executable} -m pip install pandas

#!{sys.executable} -m pip install palmerpenguins

#!{sys.executable} -m pip install plotnine
  1. Load packages
import pandas as pd
import plotnine as p9
from palmerpenguins import load_penguins
from plotnine import *
  1. Explore data
# Load the Palmer Penguins dataset into a DataFrame and preview its first five rows.
penguins = load_penguins()
penguins.head(n=5)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 male 2007
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 female 2007
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 female 2007
3 Adelie Torgersen NaN NaN NaN NaN NaN 2007
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 female 2007

Data information

# Summarise the DataFrame: index range, per-column non-null counts and dtypes, and memory usage.
penguins.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB

Descriptive statistics

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
penguins.describe()
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
count 342.000000 342.000000 342.000000 342.000000 344.000000
mean 43.921930 17.151170 200.915205 4201.754386 2008.029070
std 5.459584 1.974793 14.061714 801.954536 0.818356
min 32.100000 13.100000 172.000000 2700.000000 2007.000000
25% 39.225000 15.600000 190.000000 3550.000000 2007.000000
50% 44.450000 17.300000 197.000000 4050.000000 2008.000000
75% 48.500000 18.700000 213.000000 4750.000000 2009.000000
max 59.600000 21.500000 231.000000 6300.000000 2009.000000

3.1 Data visualisation with plotnine

  1. Scatter plot
# Scatter plot of body mass against flipper length, with points coloured by species.
scatter_plot = (
    ggplot(
        data=penguins,
        mapping=aes(x="flipper_length_mm", y="body_mass_g", color="species"),
    )
    + geom_point()
    + labs(title="Flipper Length vs Body Mass")
)
print(scatter_plot)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:411: PlotnineWarning: geom_point : Removed 2 rows containing missing values.

# Same scatter plot with a smoothing layer added on top of the points.
# Note: this rebinds `scatter_plot`, replacing the plot built in the previous cell.
scatter_plot = (
    ggplot(
        data=penguins,
        mapping=aes(x="flipper_length_mm", y="body_mass_g", color="species"),
    )
    + geom_point()
    # se=False draws the smooth line without its confidence interval.
    + geom_smooth(method="auto", se=False)
    + labs(title="Flipper Length vs Body Mass with Smooth Line")
)
print(scatter_plot)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:411: PlotnineWarning: geom_point : Removed 2 rows containing missing values.

3.2 Box plot of flipper length for each species

# Box plot of flipper length, one box per species.
box_plot = (
    ggplot(data=penguins, mapping=aes(x="species", y="flipper_length_mm"))
    + geom_boxplot()
    + labs(title="Flipper Length by Species")
)
print(box_plot)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_boxplot : Removed 2 rows containing non-finite values.

3.3 Bar chart of species counts

# Bar chart with species on the x-axis; only x is mapped, so geom_bar supplies the bar heights.
bar_chart = (
    ggplot(data=penguins, mapping=aes(x="species"))
    + geom_bar()
    + labs(title="Bar Chart of Species Counts")
)
print(bar_chart)

3.4 Histogram

# Histogram of flipper length over all penguins, using bins of width 5.
histogram_flipper = (
    ggplot(data=penguins, mapping=aes(x="flipper_length_mm"))
    + geom_histogram(
        binwidth=5,
        fill="skyblue",
        color="black",
        alpha=0.7,
    )
    + labs(title="Histogram of Flipper Length")
)
print(histogram_flipper)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.

3.5 Histogram of flipper length by species

# Histogram of flipper length filled by species and faceted into one panel per
# species; scales='free_y' is passed so the y-axis is not shared across panels.
histogram_species = (
    ggplot(data=penguins, mapping=aes(x="flipper_length_mm", fill="species"))
    + geom_histogram(binwidth=5, color="black", alpha=0.7)
    + facet_wrap("~species", scales="free_y")
    + labs(title="Histogram of Flipper Length by Species")
)
print(histogram_species)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.

3.6 Density plot

# Density plot of flipper length over all penguins.
density_plot_flipper = (
    ggplot(data=penguins, mapping=aes(x="flipper_length_mm"))
    + geom_density(fill="skyblue", color="black", alpha=0.7)
    + labs(title="Density Plot of Flipper Length")
)
print(density_plot_flipper)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_density : Removed 2 rows containing non-finite values.

3.7 Histogram of flipper length by species with additional customization

# Faceted per-species histogram with custom axis titles and a caption.
histogram_species_custom1 = (
    ggplot(data=penguins, mapping=aes(x="flipper_length_mm", fill="species"))
    + geom_histogram(binwidth=5, color="black", alpha=0.7)
    + facet_wrap("~species", scales="free_y")
    + labs(
        title="Histogram of Flipper Length by Species",
        x="Flipper Length (mm)",  # label for the x-axis
        y="Frequency",  # label for the y-axis
        caption="Source: Palmer Penguins Dataset",
    )
)
print(histogram_species_custom1)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.

3.8 Change themes

# Faceted per-species histogram with theme tweaks applied to the x-axis.
histogram_species_custom2 = (
    ggplot(data=penguins, mapping=aes(x="flipper_length_mm", fill="species"))
    + geom_histogram(binwidth=5, color="black", alpha=0.7)
    + facet_wrap("~species", scales="free_y")
    + labs(
        title="Histogram of Flipper Length by Species",
        x="Flipper Length (mm)",  # label for the x-axis
        y="Frequency",  # label for the y-axis
        caption="Source: Palmer Penguins Dataset",
    )
    + theme(
        # Rotate the x-axis tick labels by 45 degrees so they are easier to read.
        axis_text_x=element_text(angle=45, ha="right"),
        # Hide the major and minor tick marks on the x-axis.
        axis_ticks_major_x=element_blank(),
        axis_ticks_minor_x=element_blank(),
        # Hide the major and minor grid lines for the x-axis.
        panel_grid_major_x=element_blank(),
        panel_grid_minor_x=element_blank(),
    )
)
print(histogram_species_custom2)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.

3.9 Histogram of flipper length by species with dark theme

# Faceted per-species histogram rendered with plotnine's dark theme.
histogram_species_dark = (
    ggplot(data=penguins, mapping=aes(x="flipper_length_mm", fill="species"))
    + geom_histogram(binwidth=5, color="black", alpha=0.7)
    + facet_wrap("~species", scales="free_y")
    # Two-line title: the text after the newline acts as a descriptive second line.
    + ggtitle(
        "Histogram of Flipper Length by Species\n"
        "Distribution of flipper length for different penguin species"
    )
    + labs(
        x="Flipper Length (mm)",  # label for the x-axis
        y="Frequency",  # label for the y-axis
        caption="Source: Palmer Penguins Dataset",
    )
    + theme_dark()  # switch the whole plot to the dark theme
)
print(histogram_species_dark)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.