接触过 R 中 ggplot 绘图的伙伴,应该都曾被其优雅的绘图风格所吸引。如今大家基本都用 Python 来进行数据处理,而 Python 中也有许多绘图库。除了我们熟悉的 matplotlib 之外,今天给大家介绍一个拥有与 ggplot 一样绘图美学的 Python 绘图库:plotnine。plotnine 提供多种不同类型的可视化图表,并且易于定制输出。如果你之前接触过 R 中的 ggplot,那么使用 plotnine 将毫不费力。plotnine 的安装十分简单,可用 pip 或者 conda 直接安装:
# Using pip
$ pip install plotnine # 1. should be sufficient for most
$ pip install 'plotnine[all]' # 2. includes extra/optional packages
# Or using conda
$ conda install -c conda-forge plotnine
本次我们使用的数据集包含 2,410 种美国精酿啤酒的相关信息。大家不需要手动下载该数据集,直接通过下文提供的地址用 pandas 加载即可。
导入库及定义变量
安装好 plotnine 库之后,在绘图前先定义后续会用到的一些变量和函数。
import pandas as pd
import numpy as np
from plotnine import *
# URL of the beers CSV dataset.
c_remote_data = (
    'https://raw.githubusercontent.com/nickhould/craft-beers-dataset/'
    'master/data/processed/beers.csv'
)

# Chart colour palette: the same six colours listed twice (12 entries).
c_col = [
    "#2f4858", "#f6ae2d", "#f26419", "#33658a", "#55dde0", "#2f4858",
] * 2
def _tick_values(from_, to_, step_):
    """Return evenly spaced tick positions starting at ``from_``.

    The sequence is ``from_, from_ + step_, ...`` and stops at the first
    value that reaches ``to_`` (so ``to_`` itself is included whenever the
    span is a whole number of steps).

    Unlike a bare ``np.arange(from_, to_ + step_, step_)``, the number of
    ticks is computed from a rounded step count and float results are
    cleaned to 12 significant digits, so fractional steps neither gain a
    spurious extra tick nor carry noise such as ``0.30000000000000004``.

    Raises:
        ValueError: if ``step_`` is zero.
    """
    if step_ == 0:
        raise ValueError('step_ must be non-zero')
    # Rounding absorbs representation error, e.g.
    # (1.3 - 1) / 0.1 == 3.0000000000000004, which would add a fifth tick.
    n_steps = round((to_ - from_) / step_, 9)
    count = max(0, int(np.ceil(n_steps)) + 1)
    values = from_ + np.arange(count) * step_
    if values.dtype.kind != 'f':
        # Integer ticks are exact; return them untouched.
        return values.tolist()
    # Re-parse each float at 12 significant digits to strip trailing noise.
    return [float('{:.12g}'.format(v)) for v in values.tolist()]


def labels(from_, to_, step_):
    """Return the tick positions as strings with thousands separators.

    Positions are the same ones ``breaks`` returns for these arguments;
    each is formatted with ``'{:,}'`` (e.g. ``1000`` -> ``'1,000'``).
    """
    return ['{:,}'.format(v) for v in _tick_values(from_, to_, step_)]


def breaks(from_, to_, step_):
    """Return the numeric tick positions from ``from_`` up to ``to_``."""
    return _tick_values(from_, to_, step_)
加载数据并索引相应列
这里用 pandas 直接通过数据地址加载数据,并用 filter 选取需要的数据列。
# Read the CSV from the remote URL, keep only the listed columns, and use
# the `id` column as the row index.
data = (
    pd.read_csv(c_remote_data)
    .filter(items=['abv', 'ibu', 'id', 'name', 'style', 'brewery_id', 'ounces'])
    .set_index('id')
)
直方图
基本直方图
# Basic histogram of abv; rows without an abv value are dropped first.
fig = ggplot(data.dropna(subset=['abv'])) + geom_histogram(aes(x='abv'))
直方图中添加颜色
# Histogram of abv with bars filled in the first palette colour and
# outlined in black.
fig = (
    ggplot(data.dropna(subset=['abv']))
    + geom_histogram(aes(x='abv'), fill=c_col[0], color='black')
)
添加坐标名称
# Coloured histogram of abv with a title and axis names added via labs().
fig = (
    ggplot(data.dropna(subset=['abv']))
    + geom_histogram(aes(x='abv'), fill=c_col[0], color='black')
    + labs(
        title='Distribution of The alcoholic content by volume (abv)',
        x='abv - The alcoholic content by volume',
        y='Count',
    )
)
显示完整坐标刻度
# Labelled histogram of abv with explicit axis limits, tick positions and
# tick labels on both axes.
fig = (
    ggplot(data.dropna(subset=['abv']))
    + geom_histogram(aes(x='abv'), fill=c_col[0], color='black')
    + labs(
        title='Distribution of The alcoholic content by volume (abv)',
        x='abv - The alcoholic content by volume',
        y='Count',
    )
    # x axis: 0 to 0.14 in steps of 0.02.
    + scale_x_continuous(
        limits=(0, 0.14),
        labels=labels(0, 0.14, 0.02),
        breaks=breaks(0, 0.14, 0.02),
    )
    # y axis: 0 to 350 in steps of 50.
    + scale_y_continuous(
        limits=(0, 350),
        labels=labels(0, 350, 50),
        breaks=breaks(0, 350, 50),
    )
)
散点图
# Scatter plot of bitterness (ibu) against alcohol content (abv).
# Both plotted columns are checked for missing values up front, so no
# incomplete points are handed to plotnine.
fig = (
    ggplot(data.dropna(subset=['abv', 'ibu']))
    + geom_point(aes(x='abv', y='ibu'), fill=c_col[0], color='black')
    + labs(
        title='Relationship between alcoholic content (abv) and int. bittering units (ibu)',
        x='abv - The alcoholic content by volume',
        y='ibu - International bittering units',
    )
    # x axis: 0 to 0.14 in steps of 0.02.
    + scale_x_continuous(
        limits=(0, 0.14),
        labels=labels(0, 0.14, 0.02),
        breaks=breaks(0, 0.14, 0.02),
    )
    # y axis: 0 to 150 in steps of 30.
    + scale_y_continuous(
        limits=(0, 150),
        labels=labels(0, 150, 30),
        breaks=breaks(0, 150, 30),
    )
)
对不同数据点设置不同颜色
# String version of the `ounces` column, used below as the fill variable
# for the manual (discrete) fill scale.
data['ounces_str'] = data['ounces'].apply(str)

# Scatter plot of ibu against abv with points filled by can size.
# Both plotted columns are checked for missing values up front, so no
# incomplete points are handed to plotnine.
fig = (
    ggplot(data.dropna(subset=['abv', 'ibu']))
    + geom_point(
        aes(x='abv', y='ibu', fill='ounces_str'),
        alpha=0.5,
        color='black',
    )
    + labs(
        title='Relationship between alcoholic content (abv) and int. bittering units (ibu)',
        x='abv - The alcoholic content by volume',
        y='ibu - International bittering units',
    )
    # Fill colours come from the palette; the legend is titled "Ounces".
    + scale_fill_manual(name='Ounces', values=c_col)
    # x axis: 0 to 0.14 in steps of 0.02.
    + scale_x_continuous(
        limits=(0, 0.14),
        labels=labels(0, 0.14, 0.02),
        breaks=breaks(0, 0.14, 0.02),
    )
    # y axis: 0 to 150 in steps of 30.
    + scale_y_continuous(
        limits=(0, 150),
        labels=labels(0, 150, 30),
        breaks=breaks(0, 150, 30),
    )
)
热图
# Heat map (2-D binning) of ibu against abv on an 8 x 8 figure.
# Both plotted columns are checked for missing values up front, so no
# incomplete rows are handed to plotnine.
fig = (
    ggplot(data.dropna(subset=['abv', 'ibu']))
    + geom_bin2d(aes(x='abv', y='ibu'))
    + labs(
        title='Relationship between alcoholic content (abv) and int. bittering units (ibu)',
        x='abv - The alcoholic content by volume',
        y='ibu - International bittering units',
    )
    # x axis: 0 to 0.14 in steps of 0.02.
    + scale_x_continuous(
        limits=(0, 0.14),
        labels=labels(0, 0.14, 0.02),
        breaks=breaks(0, 0.14, 0.02),
    )
    # y axis: 0 to 150 in steps of 30.
    + scale_y_continuous(
        limits=(0, 150),
        labels=labels(0, 150, 30),
        breaks=breaks(0, 150, 30),
    )
    + theme(figure_size=(8, 8))
)
箱图
# Box plot of abv for each can size, with the string-typed `ounces_str`
# column on the x axis.
fig = (
    ggplot(data.dropna(subset=['abv']))
    + geom_boxplot(aes(x='ounces_str', y='abv'))
    + labs(
        title='Distribution of alcoholic content (abv) by size',
        x='size in ounces',
        y='abv - The alcoholic content by volume',
    )
    # y axis: 0 to 0.14 in steps of 0.02.
    + scale_y_continuous(
        limits=(0, 0.14),
        labels=labels(0, 0.14, 0.02),
        breaks=breaks(0, 0.14, 0.02),
    )
)
小提琴图(花式箱图)
# Violin plot of abv for each can size, filled with the first palette
# colour; the string-typed `ounces_str` column is on the x axis.
fig = (
    ggplot(data.dropna(subset=['abv']))
    + geom_violin(aes(x='ounces_str', y='abv'), fill=c_col[0])
    + labs(
        title='Distribution of alcoholic content (abv) by size',
        x='size in ounces',
        y='abv - The alcoholic content by volume',
    )
    # y axis: 0 to 0.14 in steps of 0.02.
    + scale_y_continuous(
        limits=(0, 0.14),
        labels=labels(0, 0.14, 0.02),
        breaks=breaks(0, 0.14, 0.02),
    )
)