# Numpy教程（2） 数据分析的重要功能

Numpy是python数据分析和科学计算的核心软件包。这是numpy教程的第2部分。在这一部分中，我将详细介绍numpy的高级功能，这些功能对于数据分析和操作非常重要。

photo by kevin-horstmann

### 1. 使用np.where获取满足给定条件的索引位置

``````# Create an array
import numpy as np
arr_rand = np.array([8, 8, 3, 7, 7, 0, 4, 2, 5, 2])
print("Array: ", arr_rand)

# Positions where value > 5
index_gt5 = np.where(arr_rand > 5)
print("Positions where value > 5: ", index_gt5)

#> Array:  [8 8 3 7 7 0 4 2 5 2]
#> Positions where value > 5:  (array([0, 1, 3, 4]),)
``````

``````# Take items at given index
arr_rand.take(index_gt5)

#> array([[8, 8, 7, 7]])
``````

``````# If value > 5, then yield 'gt5' else 'le5'
np.where(arr_rand > 5, 'gt5', 'le5')
#> array(['gt5', 'gt5', 'le5', 'gt5', 'gt5', 'le5', 'le5', 'le5', 'le5', 'le5'],dtype='<U3')
``````

``````# Location of the max
print('Position of max value: ', np.argmax(arr_rand))

# Location of the min
print('Position of min value: ', np.argmin(arr_rand))

#> Position of max value:  0
#> Position of min value:  5
``````

### 2. 将数据导入和导出为csv文件

``````# Turn off scientific notation
np.set_printoptions(suppress=True)

# Import data from csv file url
path = 'https://raw.githubusercontent.com/selva86/datasets/master/Auto.csv'
data = np.genfromtxt(path, delimiter=',', skip_header=1, filling_values=-999, dtype='float')
data[:3]  # see first 3 rows

#> array([[   18. ,     8. ,   307. ,   130. ,  3504. ,    12. ,    70. ,
#>             1. ,  -999. ],
#>        [   15. ,     8. ,   350. ,   165. ,  3693. ,    11.5,    70. ,
#>             1. ,  -999. ],
#>        [   18. ,     8. ,   318. ,   150. ,  3436. ,    11. ,    70. ,
#>             1. ,  -999. ]])
``````

### 2.1 处理包含数字和文本列的数据集

``````# data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype='object')
data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype=None)
data2[:3]  # see first 3 rows

#> array([( 18., 8,  307., 130, 3504,  12. , 70, 1, b'"chevrolet chevelle malibu"'),
#>        ( 15., 8,  350., 165, 3693,  11.5, 70, 1, b'"buick skylark 320"'),
#>        ( 18., 8,  318., 150, 3436,  11. , 70, 1, b'"plymouth satellite"')],
#> dtype=[('f0', '<f8'), ('f1', '<i8'), ('f2', '<f8'), ('f3', '<i8'), ('f4', '<i8'), ('f5', '<f8'), ('f6', '<i8'), ('f7', '<i8'), ('f8', 'S38')])
``````

``````# Save the array as a csv file
np.savetxt("out.csv", data, delimiter=",")
``````

### 3. 保存和加载numpy对象

Numpy为此提供了.npy和.npz文件类型。

``````# Save single numpy array object as .npy file
np.save('myarray.npy', arr2d)

# Save multile numy arrays as a .npz file
np.savez('array.npz', arr2d_f, arr2d_b)
``````

``````# Load a .npy file
print(a)

#> [[0 1 2]
#>  [3 4 5]
#>  [6 7 8]]
``````

``````# Load a .npz file
print(b.files)
b['arr_0']

#> ['arr_0', 'arr_1']

#> array([[ 0.,  1.,  2.],
#>        [ 3.,  4.,  5.],
#>        [ 6.,  7.,  8.]])
``````

### 4. 连接两个numpy数组的列式和行式###

• 方法1：利用`np.concatenate`将轴参数更改为0和1
• 方法2：`np.vstack``np.hstack`
• 方法3：`np.r_``np.c_`

``````a = np.zeros([4, 4])
b = np.ones([4, 4])
print(a)
print(b)

#> [[ 0.  0.  0.  0.]
#>  [ 0.  0.  0.  0.]
#>  [ 0.  0.  0.  0.]
#>  [ 0.  0.  0.  0.]]

#> [[ 1.  1.  1.  1.]
#>  [ 1.  1.  1.  1.]
#>  [ 1.  1.  1.  1.]
#>  [ 1.  1.  1.  1.]]
``````

``````# Vertical Stack Equivalents (Row wise)
np.concatenate([a, b], axis=0)
np.vstack([a,b])
np.r_[a,b]

#> array([[ 0.,  0.,  0.,  0.],
#>        [ 0.,  0.,  0.,  0.],
#>        [ 0.,  0.,  0.,  0.],
#>        [ 0.,  0.,  0.,  0.],
#>        [ 1.,  1.,  1.,  1.],
#>        [ 1.,  1.,  1.,  1.],
#>        [ 1.,  1.,  1.,  1.],
#>        [ 1.,  1.,  1.,  1.]])
``````

``````# Horizontal Stack Equivalents (Coliumn wise)
np.concatenate([a, b], axis=1)
np.hstack([a,b])
np.c_[a,b]

#> array([[ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
#>        [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
#>        [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
#>        [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.]])
``````

``````np.r_[[1,2,3], 0, 0, [4,5,6]]

#> array([1, 2, 3, 0, 0, 4, 5, 6])
``````

### 5. 根据一列或多列对numpy数组进行排序

``````arr = np.random.randint(1,6, size=[8, 4])

arr
#> array([[3, 3, 2, 1],
#>        [1, 5, 4, 5],
#>        [3, 1, 4, 2],
#>        [3, 4, 5, 5],
#>        [2, 4, 5, 5],
#>        [4, 4, 4, 2],
#>        [2, 4, 1, 3],
#>        [2, 2, 4, 3]])
``````

``````# Sort each columns of arr
np.sort(arr, axis=0)

#> array([[1, 1, 1, 1],
#>        [2, 2, 2, 2],
#>        [2, 3, 4, 2],
#>        [2, 4, 4, 3],
#>        [3, 4, 4, 3],
#>        [3, 4, 4, 5],
#>        [3, 4, 5, 5],
#>        [4, 5, 5, 5]])
``````

### 5.1 使用argsort对基于1列的numpy数组进行排序

``````# Get the index positions that would sort the array
x = np.array([1, 10, 5, 2, 8, 9])
sort_index = np.argsort(x)
print(sort_index)

#> [0 3 2 4 5 1]
``````

``````x[sort_index]
#> array([ 1,  2,  5,  8,  9, 10])
``````

``````# Argsort the first column
sorted_index_1stcol = arr[:, 0].argsort()

# Sort 'arr' by first column without disturbing the integrity of rows
arr[sorted_index_1stcol]
#> array([[1, 5, 4, 5],
#>        [2, 4, 5, 5],
#>        [2, 4, 1, 3],
#>        [2, 2, 4, 3],
#>        [3, 3, 2, 1],
#>        [3, 1, 4, 2],
#>        [3, 4, 5, 5],
#>        [4, 4, 4, 2]])
``````

``````# Descending sort
arr[sorted_index_1stcol[::-1]]
#> array([[4, 4, 4, 2],
#>        [3, 4, 5, 5],
#>        [3, 1, 4, 2],
#>        [3, 3, 2, 1],
#>        [2, 2, 4, 3],
#>        [2, 4, 1, 3],
#>        [2, 4, 5, 5],
#>        [1, 5, 4, 5]])
``````

### 5.2 如何基于2列或更多列对numpy数组进行排序？

``````# Sort by column 0, then by column 1
lexsorted_index = np.lexsort((arr[:, 1], arr[:, 0]))
arr[lexsorted_index]

#> array([[1, 5, 4, 5],
#>        [2, 2, 4, 3],
#>        [2, 4, 5, 5],
#>        [2, 4, 1, 3],
#>        [3, 1, 4, 2],
#>        [3, 3, 2, 1],
#>        [3, 4, 5, 5],
#>        [4, 4, 4, 2]])
``````

### 6. 使用日期###

Numpy通过`np.datetime64`实现日期对象，该对象支持精度直到纳秒。你可以使用标准的YYYY-MM-DD格式的日期字符串创建这样一个对象

``````# Create a datetime64 object
date64 = np.datetime64('2018-02-04 23:10:10')
date64
#> numpy.datetime64('2018-02-04T23:10:10')
``````

``````# Drop the time part from the datetime64 object
dt64 = np.datetime64(date64, 'D')
dt64
#> numpy.datetime64('2018-02-04')
``````

``````# Create the timedeltas (individual units of time)
tenminutes = np.timedelta64(10, 'm')  # 10 minutes
tenseconds = np.timedelta64(10, 's')  # 10 seconds
tennanoseconds = np.timedelta64(10, 'ns')  # 10 nanoseconds

print('Add 10 days: ', dt64 + 10)
print('Add 10 minutes: ', dt64 + tenminutes)
print('Add 10 seconds: ', dt64 + tenseconds)
print('Add 10 nanoseconds: ', dt64 + tennanoseconds)
``````

`dt64`转换回字符串。

``````# Convert np.datetime64 back to a string
np.datetime_as_string(dt64)
#> '2018-02-04'
``````

``````print('Date: ', dt64)
print("Is it a business day?: ", np.is_busday(dt64))
print("Add 2 business days, rolling forward to nearest biz day: ", np.busday_offset(dt64, 2, roll='forward'))
print("Add 2 business days, rolling backward to nearest biz day: ", np.busday_offset(dt64, 2, roll='backward'))
#> Date:  2018-02-04
#> Is it a business day?:  False
#> Add 2 business days, rolling forward to nearest biz day:  2018-02-07
#> Add 2 business days, rolling backward to nearest biz day:  2018-02-06
``````

### 6.1 创建一个日期序列

``````#Create date sequence
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-10'))
print(dates)

# Check if its a business day
np.is_busday(dates)
#> ['2018-02-01' '2018-02-02' '2018-02-03' '2018-02-04' '2018-02-05'
#>  '2018-02-06' '2018-02-07' '2018-02-08' '2018-02-09']

array([ True,  True, False, False,  True,  True,  True,  True,  True], dtype=bool)
``````

### 6.2 将`numpy.datetime64`转换为`datetime.datetime`对象

``````# Convert np.datetime64 to datetime.datetime
import datetime
dt = dt64.tolist()
dt
#> datetime.date(2018, 2, 4)
``````

``````print('Year: ', dt.year)
print('Day of month: ', dt.day)
print('Month of year: ', dt.month)
print('Day of Week: ', dt.weekday())  # Sunday
#> Year:  2018
#> Day of month:  4
#> Month of year:  2
#> Day of Week:  6
``````