Python数据可视化(一):散点图绘制

输入数据格式

image.png

使用matplotlib包绘制散点图

# 导入所需的python包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 设置绘图格式
plt.style.use('seaborn')
%matplotlib inline

# 创建示例数据集
df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })

#  查看示例数据头和尾的各5行
df.head(5).append(df.tail(5))
x y
0 1 7.821203
1 2 8.372683
2 3 10.616092
3 4 -0.183374
4 5 18.387730
95 96 101.110453
96 97 102.630476
97 98 90.080476
98 99 121.161754
99 100 78.376947
# 绘制基础散点图
plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')
plt.show()
image.png

设置点的形状

# marker参数设置点的形状

# === first figure:
plt.plot( 'x', 'y', data=df, linestyle='none', marker='*')
plt.show()

# === second figure:
# 所有点的类型
all_poss=['.','o','v','^','>','<','s','p','*','h','H','D','d','1','','']

# to see all possibilities:
# markers.MarkerStyle.markers.keys()

# set the limit of x and y axis:
# 设置x和y轴的范围
plt.xlim(0.5,4.5)
plt.ylim(0.5,4.5)

# remove ticks and values of axis:
# 去除x和y轴的刻度
plt.xticks([])
plt.yticks([])
#plt.set_xlabel(size=0)

# Make a loop to add markers one by one
num=0
for x in range(1,5):
  for y in range(1,5):
    num += 1
    plt.plot(x,y,marker=all_poss[num-1], markerfacecolor='orange', markersize=23, markeredgecolor="black")
    # add text annotation
    plt.text(x+0.2, y, all_poss[num-1], horizontalalignment='left', size='medium', color='black', weight='semibold')
image.png
image.png

设置点的大小

# markersize参数设置点的大小
plt.plot( 'x', 'y', data=df, linestyle='none', marker='D', markersize=16)
plt.show()
image.png

设置点的颜色

# markerfacecolor参数设置点的颜色,markeredgecolor参数设置点边框的颜色, markeredgewidth参数设置点边框的宽度
plt.plot( 'x', 'y', data=df, linestyle='none', marker="o", markersize=16, markerfacecolor='skyblue', markeredgecolor="black")
plt.show()

plt.plot( 'x', 'y', data=df, linestyle='none', marker='D', markersize=16, markeredgecolor="orange", markeredgewidth=5)
plt.show()
image.png
image.png
# 添加连接线,linestyle参数设置线的类型

plt.plot( 'x', 'y', data=df, linestyle='-', marker='o')
plt.show()
image.png

添加注释信息

# Basic chart
df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })
plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')

# 添加文本注释和箭头
# Annotate with text + Arrow
plt.annotate(
# Label and coordinate
'This point is interesting!', xy=(25, 50), xytext=(0, 80),

# Custom arrow
arrowprops=dict(facecolor='black', shrink=0.05)
)
image.png
# plot
df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })
plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')

# Annotation
plt.text(40, 00, r'equation: $\sum_{i=0}^\infty x_i/pre>, fontsize=20)
image.png
# Plot
df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })
plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')

# Annotation
# 添加垂直线
plt.axvline(40, color='r')
# 添加水平线
plt.axhline(40, color='green')
image.png
# libraries
import matplotlib.patches as patches

# Plot
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
ax1.plot( 'x', 'y', data=df, linestyle='none', marker='o')

# Add rectangle
# 添加矩形区
ax1.add_patch(
    patches.Rectangle(
        (20, 25), # (x,y)
        50, # width
        50, # height
        # You can add rotation as well with 'angle'
        alpha=0.3, facecolor="red", edgecolor="black", linewidth=3, linestyle='solid'
    )
)
image.png
# Plot
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
ax1.plot( 'x', 'y', data=df, linestyle='none', marker='o')

# Annotation
# 添加圆圈区
ax1.add_patch(
    patches.Circle(
        (40, 35),           # (x,y)
        30,                    # radius
        alpha=0.3, facecolor="green", edgecolor="black", linewidth=1, linestyle='solid'
    )
)
image.png

避免点的重叠

# Dataset:
# 构建示例数据集
df=pd.DataFrame({'x': np.random.normal(10, 1.2, 20000), 'y': np.random.normal(10, 1.2, 20000), 'group': np.repeat('A',20000) })
tmp1=pd.DataFrame({'x': np.random.normal(14.5, 1.2, 20000), 'y': np.random.normal(14.5, 1.2, 20000), 'group': np.repeat('B',20000) })
tmp2=pd.DataFrame({'x': np.random.normal(9.5, 1.5, 20000), 'y': np.random.normal(15.5, 1.5, 20000), 'group': np.repeat('C',20000) })
df=df.append(tmp1).append(tmp2)
df.head(10)
x y group
0 11.529794 11.000711 A
1 10.524043 11.541500 A
2 9.845806 9.156706 A
3 10.970836 9.428074 A
4 10.748096 12.098970 A
5 9.455139 8.636227 A
6 8.094581 8.518158 A
7 10.259945 9.168257 A
8 9.420490 10.227326 A
9 7.124481 9.170850 A
# plot
plt.plot( 'x', 'y', data=df, linestyle='', marker='o')

# 设置x轴标签
plt.xlabel('Value of X')

# 设置y轴标签
plt.ylabel('Value of Y')

# 设置标题
plt.title('Overplotting looks like that:', loc='left')
image.png
# 更改点的大小
# Plot with small marker size
plt.plot( 'x', 'y', data=df, linestyle='', marker='o', markersize=0.7)
plt.xlabel('Value of X')
plt.ylabel('Value of Y')
plt.title('Overplotting? Try to reduce the dot size', loc='left')
image.png
# 设置点的透明度
# Plot with transparency
plt.plot( 'x', 'y', data=df, linestyle='', marker='o', markersize=3, alpha=0.05, color="red")

# Titles
plt.xlabel('Value of X')
plt.ylabel('Value of Y')
plt.title('Overplotting? Try to use transparency', loc='left')
image.png
# 随机取样
# Sample 1000 random lines
# 随机取100行数据
df_sample=df.sample(1000)

# Make the plot with this subset
plt.plot( 'x', 'y', data=df_sample, linestyle='', marker='o')

# titles
plt.xlabel('Value of X')
plt.ylabel('Value of Y')
plt.title('Overplotting? Sample your data', loc='left')
image.png

使用seaborn包绘制散点图

# library & dataset
import seaborn as sns

# 加载内置数据集
df = sns.load_dataset('iris')

# 查看示例数据
df.head(5).append(df.tail(5))
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

使用regplot函数绘制散点图

# 查看regplot的用法
help(sns.regplot)

regplot(x, y, data=None, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=False, dropna=True, x_jitter=None, y_jitter=None, label=None, color=None, marker='o', scatter_kws=None, line_kws=None, ax=None)
    Plot data and a linear regression model fit.

    There are a number of mutually exclusive options for estimating the
    regression model. See the :ref:`tutorial <regression_tutorial>` for more
    information.    

    Parameters
    ----------
    x, y: string, series, or vector array
        Input variables. If strings, these should correspond with column names
        in ``data``. When pandas objects are used, axes will be labeled with
        the series name.
    data : DataFrame
        Tidy ("long-form") dataframe where each column is a variable and each
        row is an observation.    
    x_estimator : callable that maps vector -> scalar, optional
        Apply this function to each unique value of ``x`` and plot the
        resulting estimate. This is useful when ``x`` is a discrete variable.
        If ``x_ci`` is given, this estimate will be bootstrapped and a
        confidence interval will be drawn.    
    x_bins : int or vector, optional
        Bin the ``x`` variable into discrete bins and then estimate the central
        tendency and a confidence interval. This binning only influences how
        the scatterplot is drawn; the regression is still fit to the original
        data.  This parameter is interpreted either as the number of
        evenly-sized (not necessary spaced) bins or the positions of the bin
        centers. When this parameter is used, it implies that the default of
        ``x_estimator`` is ``numpy.mean``.    
    x_ci : "ci", "sd", int in [0, 100] or None, optional
        Size of the confidence interval used when plotting a central tendency
        for discrete values of ``x``. If ``"ci"``, defer to the value of the
        ``ci`` parameter. If ``"sd"``, skip bootstrapping and show the
        standard deviation of the observations in each bin.    
    scatter : bool, optional
        If ``True``, draw a scatterplot with the underlying observations (or
        the ``x_estimator`` values).    
    fit_reg : bool, optional
        If ``True``, estimate and plot a regression model relating the ``x``
        and ``y`` variables.    
    ci : int in [0, 100] or None, optional
        Size of the confidence interval for the regression estimate. This will
        be drawn using translucent bands around the regression line. The
        confidence interval is estimated using a bootstrap; for large
        datasets, it may be advisable to avoid that computation by setting
        this parameter to None.    
    n_boot : int, optional
        Number of bootstrap resamples used to estimate the ``ci``. The default
        value attempts to balance time and stability; you may want to increase
        this value for "final" versions of plots.    
    units : variable name in ``data``, optional
        If the ``x`` and ``y`` observations are nested within sampling units,
        those can be specified here. This will be taken into account when
        computing the confidence intervals by performing a multilevel bootstrap
        that resamples both units and observations (within unit). This does not
        otherwise influence how the regression is estimated or drawn.    
    order : int, optional
        If ``order`` is greater than 1, use ``numpy.polyfit`` to estimate a
        polynomial regression.    
    logistic : bool, optional
        If ``True``, assume that ``y`` is a binary variable and use
        ``statsmodels`` to estimate a logistic regression model. Note that this
        is substantially more computationally intensive than linear regression,
        so you may wish to decrease the number of bootstrap resamples
        (``n_boot``) or set ``ci`` to None.    
    lowess : bool, optional
        If ``True``, use ``statsmodels`` to estimate a nonparametric lowess
        model (locally weighted linear regression). Note that confidence
        intervals cannot currently be drawn for this kind of model.    
    robust : bool, optional
        If ``True``, use ``statsmodels`` to estimate a robust regression. This
        will de-weight outliers. Note that this is substantially more
        computationally intensive than standard linear regression, so you may
        wish to decrease the number of bootstrap resamples (``n_boot``) or set
        ``ci`` to None.    
    logx : bool, optional
        If ``True``, estimate a linear regression of the form y ~ log(x), but
        plot the scatterplot and regression model in the input space. Note that
        ``x`` must be positive for this to work.    
    {x,y}_partial : strings in ``data`` or matrices
        Confounding variables to regress out of the ``x`` or ``y`` variables
        before plotting.    
    truncate : bool, optional
        By default, the regression line is drawn to fill the x axis limits
        after the scatterplot is drawn. If ``truncate`` is ``True``, it will
        instead by bounded by the data limits.    
    {x,y}_jitter : floats, optional
        Add uniform random noise of this size to either the ``x`` or ``y``
        variables. The noise is added to a copy of the data after fitting the
        regression, and only influences the look of the scatterplot. This can
        be helpful when plotting variables that take discrete values.    
    label : string
        Label to apply to ether the scatterplot or regression line (if
        ``scatter`` is ``False``) for use in a legend.
    color : matplotlib color
        Color to apply to all plot elements; will be superseded by colors
        passed in ``scatter_kws`` or ``line_kws``.
    marker : matplotlib marker code
        Marker to use for the scatterplot glyphs.
    {scatter,line}_kws : dictionaries
        Additional keyword arguments to pass to ``plt.scatter`` and
        ``plt.plot``.    
    ax : matplotlib Axes, optional
        Axes object to draw the plot onto, otherwise uses the current Axes.

    Returns
    -------
    ax : matplotlib Axes
        The Axes object containing the plot.

    See Also
    --------
    lmplot : Combine :func:`regplot` and :class:`FacetGrid` to plot multiple
             linear relationships in a dataset.
    jointplot : Combine :func:`regplot` and :class:`JointGrid` (when used with
                ``kind="reg"``).
    pairplot : Combine :func:`regplot` and :class:`PairGrid` (when used with
               ``kind="reg"``).
    residplot : Plot the residuals of a linear regression model.

    Notes
    -----

    The :func:`regplot` and :func:`lmplot` functions are closely related, but
    the former is an axes-level function while the latter is a figure-level
    function that combines :func:`regplot` and :class:`FacetGrid`.    

    It's also easy to combine combine :func:`regplot` and :class:`JointGrid` or
    :class:`PairGrid` through the :func:`jointplot` and :func:`pairplot`
    functions, although these do not directly accept all of :func:`regplot`'s
    parameters.

    Examples
    --------

    Plot the relationship between two variables in a DataFrame:

    .. plot::
        :context: close-figs

        >>> import seaborn as sns; sns.set(color_codes=True)
        >>> tips = sns.load_dataset("tips")
        >>> ax = sns.regplot(x="total_bill", y="tip", data=tips)

    Plot with two variables defined as numpy arrays; use a different color:

    .. plot::
        :context: close-figs

        >>> import numpy as np; np.random.seed(8)
        >>> mean, cov = [4, 6], [(1.5, .7), (.7, 1)]
        >>> x, y = np.random.multivariate_normal(mean, cov, 80).T
        >>> ax = sns.regplot(x=x, y=y, color="g")

    Plot with two variables defined as pandas Series; use a different marker:

    .. plot::
        :context: close-figs

        >>> import pandas as pd
        >>> x, y = pd.Series(x, name="x_var"), pd.Series(y, name="y_var")
        >>> ax = sns.regplot(x=x, y=y, marker="+")

    Use a 68% confidence interval, which corresponds with the standard error
    of the estimate:

    .. plot::
        :context: close-figs

        >>> ax = sns.regplot(x=x, y=y, ci=68)

    Plot with a discrete ``x`` variable and add some jitter:

    .. plot::
        :context: close-figs

        >>> ax = sns.regplot(x="size", y="total_bill", data=tips, x_jitter=.1)

    Plot with a discrete ``x`` variable showing means and confidence intervals
    for unique values:

    .. plot::
        :context: close-figs

        >>> ax = sns.regplot(x="size", y="total_bill", data=tips,
        ...                  x_estimator=np.mean)

    Plot with a continuous variable divided into discrete bins:

    .. plot::
        :context: close-figs

        >>> ax = sns.regplot(x=x, y=y, x_bins=4)

    Fit a higher-order polynomial regression and truncate the model prediction:

    .. plot::
        :context: close-figs

        >>> ans = sns.load_dataset("anscombe")
        >>> ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "II"],
        ...                  scatter_kws={"s": 80},
        ...                  order=2, ci=None, truncate=True)

    Fit a robust regression and don't plot a confidence interval:

    .. plot::
        :context: close-figs

        >>> ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "III"],
        ...                  scatter_kws={"s": 80},
        ...                  robust=True, ci=None)

    Fit a logistic regression; jitter the y variable and use fewer bootstrap
    iterations:

    .. plot::
        :context: close-figs

        >>> tips["big_tip"] = (tips.tip / tips.total_bill) > .175
        >>> ax = sns.regplot(x="total_bill", y="big_tip", data=tips,
        ...                  logistic=True, n_boot=500, y_jitter=.03)

    Fit the regression model using log(x) and truncate the model prediction:

    .. plot::
        :context: close-figs

        >>> ax = sns.regplot(x="size", y="total_bill", data=tips,
        ...                  x_estimator=np.mean, logx=True, truncate=True)

# 使用regplot函数绘制散点图
sns.regplot(x=df["sepal_length"], y=df["sepal_width"])
#sns.plt.show()
image.png
# Without regression fit:
# 去掉回归线
sns.regplot(x=df["sepal_length"], y=df["sepal_width"], fit_reg=False)
#sns.plt.show()
image.png
# Change shape of marker
# marker参数设置点的形状
sns.regplot(x=df["sepal_length"], y=df["sepal_width"], marker="+", fit_reg=False)
#sns.plt.show()
image.png
# More marker customization:
# 使用scatter_kws参数设置点的颜色、透明度和大小
sns.regplot(x=df["sepal_length"], y=df["sepal_width"], fit_reg=False, scatter_kws={"color":"darkred","alpha":0.3,"s":200} )
#sns.plt.show()
image.png

使用lmplot函数绘制散点图

# Use the 'hue' argument to provide a factor variable
# hue参数设置分类变量颜色
sns.lmplot( x="sepal_length", y="sepal_width", data=df, fit_reg=False, hue='species', legend=False)

# Move the legend to an empty part of the plot
plt.legend(loc='lower right')

#sns.plt.show()
image.png
# give a list to the marker argument
# markers参数设置点的形状
sns.lmplot( x="sepal_length", y="sepal_width", data=df, fit_reg=False, hue='species', legend=False, markers=["o", "x", "1"])

# Move the legend to an empty part of the plot
plt.legend(loc='lower right')

#sns.plt.show()
image.png
# Use the 'palette' argument
# palette参数设置颜色画板
sns.lmplot( x="sepal_length", y="sepal_width", data=df, fit_reg=False, hue='species', legend=True, palette="Set2")

# Move the legend to an empty part of the plot
#plt.legend(loc='lower right')

#sns.plt.show()
image.png

使用jointplot函数绘制边际图

# Custom the inside plot: options are: “scatter” | “reg” | “resid” | “kde” | “hex”
# kind参数设置绘图类型
sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='scatter')
sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='hex')
sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='kde')
image.png
image.png
image.png
# Then you can pass arguments to each type:
# 设置点和线颜色,大小
sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)

# Custom the color
sns.set(style="white", color_codes=True)
sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='kde', color="skyblue")
image.png
image.png

使用pairplot函数绘制配对散点图

# first
sns.pairplot(df, kind="scatter", hue="species", markers=["o", "s", "D"], palette="Set2")
plt.show()

# second: you can give other arguments with plot_kws.
sns.pairplot(df, kind="scatter", hue="species", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()
image.png
image.png

参考来源:https://python-graph-gallery.com/scatter-plot/

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 159,835评论 4 364
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 67,598评论 1 295
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 109,569评论 0 244
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 44,159评论 0 213
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 52,533评论 3 287
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 40,710评论 1 222
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 31,923评论 2 313
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 30,674评论 0 203
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 34,421评论 1 246
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 30,622评论 2 245
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 32,115评论 1 260
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 28,428评论 2 254
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 33,114评论 3 238
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 26,097评论 0 8
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 26,875评论 0 197
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 35,753评论 2 276
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 35,649评论 2 271

推荐阅读更多精彩内容