×

# 熟悉numpy和pandas

2017年5月22日12:21:43：几乎所有的长文章，都转移到了 zhouww.com，请大家到新网址阅读

### 基于用户的推荐

- 怎么判断相似？
  - 曼哈顿距离
  - 欧氏距离
  - 皮尔逊系数
  - 余弦相似度
- 怎么推荐？
  - 找到与之最相似的用户，推荐这位用户评价最高的内容
  - 找到与之比较相似的若干用户，按相似度加权处理

#### 距离

``````python
# -*- coding: utf-8 -*-
__author__ = 'duohappy'

import numpy as np
import pandas as pd

# Sample ratings used by this script: user name -> {band name -> rating}.
# Users list different subsets of bands; pd.DataFrame(users) fills the
# combinations that are absent here with NaN.
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
"Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
}

def manhattan_pd(user1, user2):
    """Return the Manhattan distance between two users' rating Series.

    Only items rated by both users contribute: the element-wise difference
    is NaN wherever either rating is missing, and those entries are dropped.

    Args:
        user1: pandas Series of ratings indexed by item.
        user2: pandas Series of ratings indexed by item.

    Returns:
        The sum of absolute rating differences over the co-rated items, or
        ``np.inf`` when the two users have no item in common. Summing an
        all-NaN Series would give 0, which would wrongly make such a user
        look like the closest possible match.
    """
    differences = (user1 - user2).abs().dropna()

    if differences.empty:
        return np.inf

    return differences.sum()

def topN_pd(user, users_df):
    """Rank every other user by Manhattan distance to ``user``, nearest first.

    Args:
        user: rating Series of the target user; ``user.name`` must be a
            column label of ``users_df``.
        users_df: DataFrame whose columns are users and whose rows are items.

    Returns:
        A list of ``(user_name, distance)`` tuples sorted by ascending distance.
    """
    # DataFrame.drop returns a new object, so the caller's frame is untouched.
    others = users_df.drop(user.name, axis=1)

    # others[name] looks up the column whose label is the *value* of ``name``;
    # attribute access (others.name) would look for a column literally
    # called "name".
    ranked = [(name, manhattan_pd(user, others[name])) for name in others]
    ranked.sort(key=lambda pair: pair[-1])

    return ranked

def recommend_pd(user, users_df):
    """Recommend items rated by the single nearest neighbour of ``user``.

    Args:
        user: rating Series of the target user (NaN marks an unrated item).
        users_df: DataFrame whose columns are users and whose rows are items.

    Returns:
        A Series of the nearest user's ratings, restricted to items ``user``
        has not rated and sorted from highest to lowest. An empty Series is
        returned when there is no other user to compare against.
    """
    distances = topN_pd(user, users_df)

    # With no other user in the frame there is nobody to borrow ratings from.
    if not distances:
        return pd.Series(dtype=float)

    nearest_user = distances[0][0]

    # Rows the target user has not rated, taken from the nearest user's
    # column; dropna() then removes items the neighbour has not rated either.
    not_rated_by_user = user.isnull()
    recommendations = users_df.loc[not_rated_by_user, nearest_user].dropna()

    return recommendations.sort_values(ascending=False)

if __name__ == '__main__':
    # The dict-based helpers (manhattan / topN / recommend) are not defined in
    # this script, so the former print(recommend('Bill', users)) call raised
    # NameError before any pandas code ran. Only the pandas versions are used.
    users_df = pd.DataFrame(users)

    # Other entry points that can be tried here:
    #   manhattan_pd(users_df.Angelica, users_df.Bill)
    #   topN_pd(users_df.Angelica, users_df)
    print(recommend_pd(users_df.Bill, users_df))
``````

#### 皮尔逊系数

``````python
# -*- coding: utf-8 -*-
__author__ = 'duohappy'

import numpy as np
import pandas as pd

# Sample ratings used by this script: user name -> {band name -> rating}.
# Users list different subsets of bands; pd.DataFrame(users) fills the
# combinations that are absent here with NaN.
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
"Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
}

def pearson(user1, user2):
    """Return the Pearson correlation of two users over their co-rated items.

    Missing ratings are not filled with 0: an unrated item says nothing about
    how the user would have scored it, so only items rated by both users
    take part in the computation.

    Args:
        user1: pandas Series of ratings indexed by item (NaN = not rated).
        user2: pandas Series of ratings indexed by item (NaN = not rated).

    Returns:
        The correlation coefficient in [-1, 1]. Returns 0.0 when it is
        undefined, i.e. when the users share no rated item or when either
        user's co-rated scores are all identical (zero variance), instead of
        propagating NaN from a 0/0 division.
    """
    # Restrict both Series to the labels they share, then to the items both
    # users actually rated.
    user1, user2 = user1.align(user2, join='inner')
    both_rated = user1.notnull() & user2.notnull()
    user1 = user1[both_rated]
    user2 = user2[both_rated]

    if user1.empty:
        return 0.0

    user1_centered = user1 - user1.mean()
    user2_centered = user2 - user2.mean()

    # Numerator: inner product of the mean-centred rating vectors.
    num = user1_centered.dot(user2_centered)
    # Denominator: product of the two vectors' Euclidean norms.
    den = (np.sqrt(user1_centered.dot(user1_centered))
           * np.sqrt(user2_centered.dot(user2_centered)))

    if den == 0:
        return 0.0

    return num / den

if __name__ == '__main__':
    # Load the nested dict as an item x user table and correlate two of its
    # columns. The result is not printed; inspect users_df['Angelica'] or
    # users_df['Hailey'] to see where the NaN gaps are.
    users_df = pd.DataFrame(users)

    pearson(users_df['Angelica'], users_df['Bill'])

``````

#### 余弦相似度

``````python
# -*- coding: utf-8 -*-
__author__ = 'duohappy'

import numpy as np
import pandas as pd

# Sample ratings used by this script: user name -> {band name -> rating}.
# Users list different subsets of bands; pd.DataFrame(users) fills the
# combinations that are absent here with NaN.
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
"Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
}

def cosine(user1, user2):
    """Return the cosine similarity of two users' rating vectors.

    Unlike the Pearson variant, missing ratings are replaced by 0 here, so
    every item contributes to the vector norms.

    Args:
        user1: pandas Series of ratings indexed by item (NaN = not rated).
        user2: pandas Series of ratings over the same items.

    Returns:
        The cosine of the angle between the two zero-filled vectors. Returns
        0.0 when either vector has zero length (a user with no ratings at
        all), where the ratio would otherwise be 0/0 = NaN.
    """
    user1 = user1.fillna(0)
    user2 = user2.fillna(0)

    num = user1.dot(user2)
    den = np.sqrt(user1.dot(user1)) * np.sqrt(user2.dot(user2))

    if den == 0:
        return 0.0

    return num / den

if __name__ == '__main__':
    # Build the item x user table and compare two of its columns; the
    # similarity value itself is not printed.
    users_df = pd.DataFrame(users)

    cosine(users_df['Angelica'], users_df['Veronica'])

``````

#### K最近邻算法

``````python
# -*- coding: utf-8 -*-
__author__ = 'duohappy'

import numpy as np
import pandas as pd

from pearson_coefficient import pearson

# Sample ratings used by this script: critic name -> {film title -> rating}.
# Critics list different subsets of films; pd.DataFrame(users) fills the
# combinations that are absent here with NaN.
users={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 3.5},
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
'The Night Listener': 4.5, 'Superman Returns': 4.0,
'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

def topN(user, users_df):
    """List the users positively correlated with ``user``, most similar first.

    Args:
        user: rating Series of the target user; ``user.name`` must be a
            column label of ``users_df``.
        users_df: DataFrame whose columns are users and whose rows are items.

    Returns:
        A list of ``(user_name, pearson_coefficient)`` tuples, keeping only
        coefficients greater than 0, sorted in descending order. The list is
        also printed.
    """
    # drop() hands back a new frame without the target user's column; the
    # DataFrame owned by the caller is not modified.
    others = users_df.drop(user.name, axis=1)

    # Pearson coefficient of every remaining user against the target user.
    scored = ((name, pearson(others[name], user)) for name in others)

    # Keep positive correlations only.
    similarity = [(name, score) for name, score in scored if score > 0]
    similarity.sort(key=lambda pair: pair[-1], reverse=True)

    print(similarity)

    return similarity

def recommend(user, users_df):
    """Predict scores for items ``user`` has not rated, weighted by similarity.

    Each candidate item is scored as the similarity-weighted average of the
    ratings given to it by the positively correlated users returned by
    ``topN``.

    Args:
        user: rating Series of the target user (NaN = not rated).
        users_df: DataFrame whose columns are users and whose rows are items.

    Returns:
        A list of ``(item, predicted_score)`` tuples sorted from highest to
        lowest predicted score. Items that none of the similar users rated
        are left out, because no weighted average exists for them.
    """
    sim = topN(user, users_df)  # [(user_name, similarity), ...]

    # Every row label of each similar user's column is a candidate item.
    all_choice = []
    for neighbour, _ in sim:
        all_choice.extend(users_df[neighbour].index)

    # Remove what the target user already rated; dropna() is required because
    # the index also carries the items the user has NOT rated.
    available_choice = set(all_choice) - set(user.dropna().index)

    print(available_choice)

    recom = []

    for choice in available_choice:
        sum_score = 0  # similarity-weighted sum of ratings
        sum_sim = 0  # sum of the similarities that contributed
        for neighbour, weight in sim:
            score = users_df.loc[choice, neighbour]
            if not np.isnan(score):
                sum_score += weight * score
                sum_sim += weight

        # No similar user rated this item: skip it rather than divide 0 by 0.
        if sum_sim == 0:
            continue

        recom.append((choice, sum_score / sum_sim))

    recom = sorted(recom, key=lambda x: x[-1], reverse=True)

    return recom

if __name__ == '__main__':
    # Build the film x critic table and run the weighted recommendation for
    # Toby. topN(...) can be called directly on any column of users_df to
    # inspect the neighbour ranking on its own.
    users_df = pd.DataFrame(users)

    recommend(users_df['Toby'], users_df)
``````

### 基于物品的推荐

#### 修正的余弦相似度

``````python
# -*- coding: utf-8 -*-
__author__ = 'duohappy'

import numpy as np
import pandas as pd

# Sample ratings used by this script: user name -> {artist name -> rating}.
# Users list different subsets of artists; pd.DataFrame(users) fills the
# combinations that are absent here with NaN.
users = {"David": {"Imagine Dragons": 3, "Daft Punk": 5,
"Lorde": 4, "Fall Out Boy": 1},
"Matt": {"Imagine Dragons": 3, "Daft Punk": 4,
"Lorde": 4, "Fall Out Boy": 1},
"Ben": {"Kacey Musgraves": 4, "Imagine Dragons": 3,
"Lorde": 3, "Fall Out Boy": 1},
"Chris": {"Kacey Musgraves": 4, "Imagine Dragons": 4,
"Daft Punk": 4, "Lorde": 3, "Fall Out Boy": 1},
"Tori": {"Kacey Musgraves": 5, "Imagine Dragons": 4,
"Daft Punk": 5, "Fall Out Boy": 3}}

def cosine(item1, item2, users_df):
    """Return the adjusted cosine similarity between two items.

    Each rating is centred on the mean of the user who gave it, which
    compensates for users who rate consistently high or low. Only users who
    rated both items are taken into account.

    Args:
        item1: row of ``users_df`` (ratings of one item, indexed by user).
        item2: another row of ``users_df`` with the same index.
        users_df: DataFrame whose columns are users and whose rows are items;
            used to compute each user's mean rating.

    Returns:
        The adjusted cosine similarity. Returns 0.0 when no user rated both
        items or when the denominator is zero (every common rating equals
        its user's mean), instead of returning NaN.
    """
    # Element-wise AND (& rather than ``and``): True only where both items
    # have a rating. Working on .values sidesteps index alignment.
    both_rated = item1.notnull().values & item2.notnull().values

    if not both_rated.any():
        return 0.0

    item1 = item1[both_rated]
    item2 = item2[both_rated]

    # Mean rating of every user who rated both items. DataFrame.mean skips
    # NaN, so each mean covers only the items that user actually rated.
    user_means = users_df[item1.index].mean()

    item1_centered = item1 - user_means
    item2_centered = item2 - user_means

    num = item1_centered.dot(item2_centered)
    den = (np.sqrt(item1_centered.dot(item1_centered))
           * np.sqrt(item2_centered.dot(item2_centered)))

    if den == 0:
        return 0.0

    return num / den

# 得到修正的余弦相似度矩阵
def cosineMatrix(users_df):
    """Build the item-to-item adjusted cosine similarity matrix.

    Args:
        users_df: DataFrame whose columns are users and whose rows are items.

    Returns:
        A square DataFrame indexed and labelled by the items of ``users_df``,
        with 1 on the diagonal and the symmetric pairwise similarity
        elsewhere. The matrix is also written to ``./tmp.csv``.
    """
    items = users_df.index
    # Size the matrix from the data instead of a fixed 5 x 5, so any number
    # of items works.
    size = len(items)
    cosine_matrix = pd.DataFrame(np.zeros([size, size]), index=items, columns=items)

    # Only the lower triangle (including the diagonal) is computed; each
    # value is mirrored because the similarity is symmetric.
    for count, row in enumerate(items, start=1):
        for column in items[:count]:
            if row == column:
                cosine_matrix.loc[row, column] = 1
                continue
            value = cosine(users_df.loc[row], users_df.loc[column], users_df)
            cosine_matrix.loc[row, column] = value
            cosine_matrix.loc[column, row] = value

    cosine_matrix.to_csv('./tmp.csv')

    return cosine_matrix

def predict_score(user, item, users_df, min_rating=1, max_rating=5):
    """Predict ``user``'s rating of ``item`` from item-to-item similarities.

    The user's known ratings are rescaled to [-1, 1], combined as a weighted
    average using the similarity of each rated item to ``item``, and the
    result is mapped back onto the original rating scale.

    Args:
        user: column of ``users_df`` (one user's ratings, indexed by item).
        item: row of ``users_df``; only ``item.name`` is used, to select the
            matching column of the similarity matrix.
        users_df: DataFrame whose columns are users and whose rows are items.
        min_rating: lowest value of the rating scale (default 1).
        max_rating: highest value of the rating scale (default 5).

    Returns:
        A tuple ``(nr, r)``: the prediction on the normalised [-1, 1] scale
        and the same prediction on the original rating scale.

    Raises:
        ValueError: if ``max_rating`` is not greater than ``min_rating``.
    """
    if max_rating <= min_rating:
        raise ValueError("max_rating must be greater than min_rating")

    span = max_rating - min_rating

    # Items the user has actually rated.
    rated = user.notnull()

    user = user[rated]
    # Similarity between the target item and each item the user rated.
    cosine_item = cosineMatrix(users_df)[item.name][rated]

    # Rescale the known ratings from [min_rating, max_rating] to [-1, 1].
    user = (2 * (user - min_rating) - span) / span

    num = user.dot(cosine_item)
    den = np.abs(cosine_item).sum()

    nr = num / den
    # Map back to the rating scale. 0.5 is spelled out because 1/2 is 0
    # under Python 2 integer division.
    r = 0.5 * ((nr + 1) * span) + min_rating

    return nr, r

if __name__ == '__main__':
    # Rows of the table are artists and columns are users, so a single
    # artist is selected with .loc[...] and a single user by column label.
    users_df = pd.DataFrame(users)

    print(users_df)

    # The intermediate steps can be run on their own, e.g.
    #   cosine(users_df.loc['Kacey Musgraves'], users_df.loc['Imagine Dragons'], users_df)
    #   cosineMatrix(users_df)
    predict_score(users_df['David'], users_df.loc['Kacey Musgraves'], users_df)
``````

#### slope one

``````python
# -*- coding: utf-8 -*-
__author__ = 'duohappy'

import numpy as np
import pandas as pd

# Sample ratings used by this script: user name -> {artist name -> rating}.
# Users list different subsets of artists; pd.DataFrame(users) fills the
# combinations that are absent here with NaN.
users = {"Amy": {"Taylor Swift": 4, "PSY": 3, "Whitney Houston": 4},
"Ben": {"Taylor Swift": 5, "PSY": 2},
"Clara": {"PSY": 3.5, "Whitney Houston": 4},
"Daisy": {"Taylor Swift": 5, "Whitney Houston": 3}}

def slope_one_func(item1, item2):
    """Return the Slope One deviation of ``item1`` with respect to ``item2``.

    Args:
        item1: Series of ratings for one item, indexed by user.
        item2: Series of ratings for another item, same index.

    Returns:
        The sum, over the users who rated both items, of
        ``(rating of item1 - rating of item2) / number of such users``.
    """
    # A user counts only when neither of the two ratings is missing.
    both_rated = ~(item1.isnull() | item2.isnull())

    left = item1[both_rated]
    right = item2[both_rated]

    # Number of users who rated both items.
    n_common = len(left)

    return left.sub(right).div(n_common).sum()

def slop_one_matrix(users_df):
    """Build the item-to-item Slope One deviation matrix.

    Args:
        users_df: DataFrame whose columns are users and whose rows are items.

    Returns:
        A square DataFrame indexed and labelled by the items of ``users_df``.
        Entry ``[row, column]`` is the deviation of ``row`` with respect to
        ``column``; the diagonal is 0 and the matrix is antisymmetric.
    """
    items = users_df.index
    # Size the matrix from the data instead of a fixed 3 x 3, so any number
    # of items works.
    size = len(items)
    matrix = pd.DataFrame(np.zeros((size, size)), index=items, columns=items)

    # Compute the lower triangle only; the mirrored entry is its negation.
    for count, row in enumerate(items, start=1):
        for column in items[:count]:
            if row == column:
                matrix.loc[row, column] = 0
                continue
            deviation = slope_one_func(users_df.loc[row], users_df.loc[column])
            matrix.loc[row, column] = deviation
            matrix.loc[column, row] = -deviation

    return matrix

if __name__ == '__main__':
    # Rows of the table are artists, columns are users.
    users_df = pd.DataFrame(users)

    print(users_df)

    # A single pair can be checked with
    #   slope_one_func(users_df.loc['PSY'], users_df.loc['Taylor Swift'])
    slop_one_matrix(users_df)

``````