#import sys
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install palmerpenguins
#!{sys.executable} -m pip install plotnine
3 Data Visualisation with Grammar of Graphics
- Open jupyter notebook from the command line:
jupyter notebook
Opening a specific notebook
jupyter notebook notebook.ipynb
- Install the following packages
- Load packages
import pandas as pd
import plotnine as p9
from palmerpenguins import load_penguins
from plotnine import *
- Explore data
# Load the Palmer Penguins dataset and preview its first rows.
penguins = load_penguins()
penguins.head()
| | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
Data information
# Summarise the DataFrame: index range, per-column non-null counts, dtypes, and memory usage.
penguins.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 species 344 non-null object
1 island 344 non-null object
2 bill_length_mm 342 non-null float64
3 bill_depth_mm 342 non-null float64
4 flipper_length_mm 342 non-null float64
5 body_mass_g 342 non-null float64
6 sex 333 non-null object
7 year 344 non-null int64
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB
Descriptive statistics
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
penguins.describe()
| | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | year |
---|---|---|---|---|---|
count | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 344.000000 |
mean | 43.921930 | 17.151170 | 200.915205 | 4201.754386 | 2008.029070 |
std | 5.459584 | 1.974793 | 14.061714 | 801.954536 | 0.818356 |
min | 32.100000 | 13.100000 | 172.000000 | 2700.000000 | 2007.000000 |
25% | 39.225000 | 15.600000 | 190.000000 | 3550.000000 | 2007.000000 |
50% | 44.450000 | 17.300000 | 197.000000 | 4050.000000 | 2008.000000 |
75% | 48.500000 | 18.700000 | 213.000000 | 4750.000000 | 2009.000000 |
max | 59.600000 | 21.500000 | 231.000000 | 6300.000000 | 2009.000000 |
3.1 Data visualisation with plotnine
- Scatter plot
# Scatter plot of flipper length against body mass, coloured by species.
scatter_plot = (
    ggplot(penguins, aes(x='flipper_length_mm', y='body_mass_g', color='species')) +
    geom_point() +
    labs(title='Flipper Length vs Body Mass')
)
print(scatter_plot)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:411: PlotnineWarning: geom_point : Removed 2 rows containing missing values.
# Same scatter plot with a smoothed trend line layered on top of the points.
scatter_plot = (
    ggplot(penguins, aes(x='flipper_length_mm', y='body_mass_g', color='species')) +
    geom_point() +
    geom_smooth(method='auto', se=False) +  # Add a smooth line without confidence interval
    labs(title='Flipper Length vs Body Mass with Smooth Line')
)
print(scatter_plot)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:411: PlotnineWarning: geom_point : Removed 2 rows containing missing values.
3.2 Box plot of flipper length for each species
# Box plot of flipper length, one box per species.
box_plot = (
    ggplot(penguins, aes(x='species', y='flipper_length_mm')) +
    geom_boxplot() +
    labs(title='Flipper Length by Species')
)
print(box_plot)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_boxplot : Removed 2 rows containing non-finite values.
3.3 Bar chart of species counts
# Bar chart counting the number of rows for each species.
bar_chart = (
    ggplot(penguins, aes(x='species')) +
    geom_bar() +
    labs(title='Bar Chart of Species Counts')
)
print(bar_chart)
3.4 Histogram
# Histogram of flipper length using 5 mm wide bins.
histogram_flipper = (
    ggplot(penguins, aes(x='flipper_length_mm')) +
    geom_histogram(binwidth=5, fill='skyblue', color='black', alpha=0.7) +
    labs(title='Histogram of Flipper Length')
)
print(histogram_flipper)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
3.5 Histogram of flipper length by species
# Histogram of flipper length, filled by species and split into one facet per species.
histogram_species = (
    ggplot(penguins, aes(x='flipper_length_mm', fill='species')) +
    geom_histogram(binwidth=5, color='black', alpha=0.7) +
    facet_wrap('~species', scales='free_y') +
    labs(title='Histogram of Flipper Length by Species')
)
print(histogram_species)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.
3.6 Density plot
# Density plot of flipper length.
density_plot_flipper = (
    ggplot(penguins, aes(x='flipper_length_mm')) +
    geom_density(fill='skyblue', color='black', alpha=0.7) +
    labs(title='Density Plot of Flipper Length')
)
print(density_plot_flipper)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_density : Removed 2 rows containing non-finite values.
3.7 Histogram of flipper length by species with additional customization
# Faceted histogram with custom title, axis labels, and caption.
histogram_species_custom1 = (
    ggplot(penguins, aes(x='flipper_length_mm', fill='species')) +
    geom_histogram(binwidth=5, color='black', alpha=0.7) +
    facet_wrap('~species', scales='free_y') +
    labs(
        title='Histogram of Flipper Length by Species',
        x='Flipper Length (mm)',  # X-axis title
        y='Frequency',  # Y-axis title
        caption='Source: Palmer Penguins Dataset',
    )
)
print(histogram_species_custom1)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.
3.8 Change themes
# Faceted histogram with custom labels plus theme tweaks to the x-axis text, ticks, and grid lines.
histogram_species_custom2 = (
    ggplot(penguins, aes(x='flipper_length_mm', fill='species')) +
    geom_histogram(binwidth=5, color='black', alpha=0.7) +
    facet_wrap('~species', scales='free_y') +
    labs(
        title='Histogram of Flipper Length by Species',
        x='Flipper Length (mm)',  # X-axis title
        y='Frequency',  # Y-axis title
        caption='Source: Palmer Penguins Dataset'
    ) +
    theme(
        axis_text_x=element_text(angle=45, ha='right'),  # Rotate x-axis labels for better readability
        axis_ticks_major_x=element_blank(),  # Hide major x-axis ticks
        axis_ticks_minor_x=element_blank(),  # Hide minor x-axis ticks
        panel_grid_major_x=element_blank(),  # Hide major x-grid lines
        panel_grid_minor_x=element_blank(),  # Hide minor x-grid lines
    )
)
print(histogram_species_custom2)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.
3.9 Histogram of flipper length by species with dark theme
# Faceted histogram with a two-line title, custom labels, and the built-in dark theme.
histogram_species_dark = (
    ggplot(penguins, aes(x='flipper_length_mm', fill='species')) +
    geom_histogram(binwidth=5, color='black', alpha=0.7) +
    facet_wrap('~species', scales='free_y') +
    ggtitle('Histogram of Flipper Length by Species\nDistribution of flipper length for different penguin species') +
    labs(
        x='Flipper Length (mm)',  # X-axis title
        y='Frequency',  # Y-axis title
        caption='Source: Palmer Penguins Dataset',
    ) +
    theme_dark()  # Apply dark theme
)
print(histogram_species_dark)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/layer.py:333: PlotnineWarning: stat_bin : Removed 2 rows containing non-finite values.
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/plotnine/facets/facet.py:440: PlotnineWarning: If you need more space for the x-axis tick text use ... + theme(subplots_adjust={'wspace': 0.25}). Choose an appropriate value for 'wspace'.