Python 协同推荐作业

Posted by 韩同学的笔记本 on March 24, 2020

Exercise 1

1
2
import pandas as pd
pd.options.display.max_rows = 10
1
2
3
4
5
6
7
8
9
10
11
12
# Create the demo DataFrame: 12 army units, each described by 10 attributes.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'deaths': [523, 52, 25, 616, 43, 234, 523, 62, 62, 73, 37, 35],
    'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
    'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
    'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
    'readiness': [1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
    'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
    'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa',
               'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia'],
}
# The column order is spelled out explicitly; it is the dict's key order.
column_order = list(raw_data)
army = pd.DataFrame(raw_data, columns=column_order)
1
# Notebook cell: the bare expression displays the DataFrame built above
army
regiment company deaths battles size veterans readiness armored deserters origin
0 Nighthawks 1st 523 5 1045 1 1 1 4 Arizona
1 Nighthawks 1st 52 42 957 5 2 0 24 California
2 Nighthawks 2nd 25 2 1099 62 3 1 31 Texas
3 Nighthawks 2nd 616 2 1400 26 3 1 2 Florida
4 Dragoons 1st 43 4 1592 73 2 0 3 Maine
... ... ... ... ... ... ... ... ... ... ...
7 Dragoons 2nd 62 3 849 48 3 1 31 Washington
8 Scouts 1st 62 4 973 48 2 0 2 Oregon
9 Scouts 1st 73 7 1005 435 1 0 3 Wyoming
10 Scouts 2nd 37 8 1099 63 2 1 2 Louisana
11 Scouts 2nd 35 9 1523 345 3 1 3 Georgia

12 rows × 10 columns

1
2
3
# Use the 'origin' column as the index of the DataFrame.
# set_index returns a new frame, so the result is bound back to `army`.
army = army.set_index('origin')
# Bare expression: displays the re-indexed frame in the notebook
army
regiment company deaths battles size veterans readiness armored deserters
origin
Arizona Nighthawks 1st 523 5 1045 1 1 1 4
California Nighthawks 1st 52 42 957 5 2 0 24
Texas Nighthawks 2nd 25 2 1099 62 3 1 31
Florida Nighthawks 2nd 616 2 1400 26 3 1 2
Maine Dragoons 1st 43 4 1592 73 2 0 3
... ... ... ... ... ... ... ... ... ...
Washington Dragoons 2nd 62 3 849 48 3 1 31
Oregon Scouts 1st 62 4 973 48 2 0 2
Wyoming Scouts 1st 73 7 1005 435 1 0 3
Louisana Scouts 2nd 37 8 1099 63 2 1 2
Georgia Scouts 2nd 35 9 1523 345 3 1 3

12 rows × 9 columns

1
2
3
4
# Show at least two ways to access a single column.
# 1) Label-based selection with .loc: every row, the 'regiment' column
print(army.loc[:,'regiment'])
# 2) .get() looks the column up by its key
print(army.get('regiment'))
# 3) .filter(like=...) keeps the columns whose label contains 'regiment';
#    unlike the two calls above, the result is a one-column DataFrame
print(army.filter(like='regiment'))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
origin
Arizona       Nighthawks
California    Nighthawks
Texas         Nighthawks
Florida       Nighthawks
Maine           Dragoons
                 ...    
Washington      Dragoons
Oregon            Scouts
Wyoming           Scouts
Louisana          Scouts
Georgia           Scouts
Name: regiment, Length: 12, dtype: object
origin
Arizona       Nighthawks
California    Nighthawks
Texas         Nighthawks
Florida       Nighthawks
Maine           Dragoons
                 ...    
Washington      Dragoons
Oregon            Scouts
Wyoming           Scouts
Louisana          Scouts
Georgia           Scouts
Name: regiment, Length: 12, dtype: object
              regiment
origin                
Arizona     Nighthawks
California  Nighthawks
Texas       Nighthawks
Florida     Nighthawks
Maine         Dragoons
...                ...
Washington    Dragoons
Oregon          Scouts
Wyoming         Scouts
Louisana        Scouts
Georgia         Scouts

[12 rows x 1 columns]
1
2
3
4
5
# Show at least two ways to access a single row.
# 1) Label-based lookup on the index
print(army.loc['Arizona'])
# 2) Position-based lookup: the first row
print(army.iloc[0])
# 3) query() can refer to the index by its name ('origin')
print(army.query('origin == "Arizona"'))
# 4) Boolean mask built from the index; like query(), this yields a DataFrame
print(army.loc[army.index=='Arizona'])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
regiment     Nighthawks
company             1st
deaths              523
battles               5
size               1045
veterans              1
readiness             1
armored               1
deserters             4
Name: Arizona, dtype: object
regiment     Nighthawks
company             1st
deaths              523
battles               5
size               1045
veterans              1
readiness             1
armored               1
deserters             4
Name: Arizona, dtype: object
           regiment company  deaths  battles  size  veterans  readiness  \
origin                                                                    
Arizona  Nighthawks     1st     523        5  1045         1          1   

         armored  deserters  
origin                       
Arizona        1          4  
           regiment company  deaths  battles  size  veterans  readiness  \
origin                                                                    
Arizona  Nighthawks     1st     523        5  1045         1          1   

         armored  deserters  
origin                       
Arizona        1          4  
1
2
# Select the rows whose deaths value is greater than 50 and less than 500
deaths = army['deaths']
in_range = deaths.gt(50) & deaths.lt(500)
print(army.loc[in_range])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
              regiment company  deaths  battles  size  veterans  readiness  \
origin                                                                       
California  Nighthawks     1st      52       42   957         5          2   
Iowa          Dragoons     1st     234        7  1006        37          1   
Washington    Dragoons     2nd      62        3   849        48          3   
Oregon          Scouts     1st      62        4   973        48          2   
Wyoming         Scouts     1st      73        7  1005       435          1   

            armored  deserters  
origin                          
California        0         24  
Iowa              1          4  
Washington        1         31  
Oregon            0          2  
Wyoming           0          3  
1
2
# Sort by readiness; rows with the same readiness are ordered by size.
# The sorted frame is only displayed, not assigned, so `army` keeps its order.
army.sort_values(by=['readiness', 'size'])
regiment company deaths battles size veterans readiness armored deserters
origin
Wyoming Scouts 1st 73 7 1005 435 1 0 3
Iowa Dragoons 1st 234 7 1006 37 1 1 4
Arizona Nighthawks 1st 523 5 1045 1 1 1 4
California Nighthawks 1st 52 42 957 5 2 0 24
Oregon Scouts 1st 62 4 973 48 2 0 2
... ... ... ... ... ... ... ... ... ...
Maine Dragoons 1st 43 4 1592 73 2 0 3
Washington Dragoons 2nd 62 3 849 48 3 1 31
Texas Nighthawks 2nd 25 2 1099 62 3 1 31
Florida Nighthawks 2nd 616 2 1400 26 3 1 2
Georgia Scouts 2nd 35 9 1523 345 3 1 3

12 rows × 9 columns

1
2
3
4
5
6
7
# Select the rows whose origin starts with 'A'
print('方法一:')
# Method 1: string comparison, every label in the half-open range ['A', 'B')
starts_with_a = (army.index >= 'A') & (army.index < 'B')
print(army.loc[starts_with_a])
print()
print('方法二:')
# Method 2: vectorized str.startswith on the index
prefix_mask = army.index.str.startswith('A')
print(army.loc[prefix_mask])
print()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
方法一:
           regiment company  deaths  battles  size  veterans  readiness  \
origin                                                                    
Arizona  Nighthawks     1st     523        5  1045         1          1   
Alaska     Dragoons     2nd     523        8   987       949          2   

         armored  deserters  
origin                       
Arizona        1          4  
Alaska         0         24  

方法二:
           regiment company  deaths  battles  size  veterans  readiness  \
origin                                                                    
Arizona  Nighthawks     1st     523        5  1045         1          1   
Alaska     Dragoons     2nd     523        8   987       949          2   

         armored  deserters  
origin                       
Arizona        1          4  
Alaska         0         24  

Exercise 2

1
2
3
# Load the data set
df = pd.read_csv('./datasets/student-mat.csv')
# Bare expression: displays the first five rows in the notebook
df.head()
school sex age address famsize Pstatus Medu Fedu Mjob Fjob ... famrel freetime goout Dalc Walc health absences G1 G2 G3
0 GP F 18 U GT3 A 4 4 at_home teacher ... 4 3 4 1 1 3 6 5 6 6
1 GP F 17 U GT3 T 1 1 at_home other ... 5 3 3 1 1 3 4 5 5 6
2 GP F 15 U LE3 T 1 1 at_home other ... 4 3 2 2 3 3 10 7 8 10
3 GP F 15 U GT3 T 4 2 health services ... 3 2 2 1 1 5 2 15 14 15
4 GP F 16 U GT3 T 3 3 other other ... 4 3 2 1 2 5 4 6 10 10

5 rows × 33 columns

1
2
# Only the columns from "school" through "guardian" are needed for this exercise.
# Take an explicit copy: later cells assign columns on `cur`, and writing into
# a slice of `df` is what triggers pandas' SettingWithCopyWarning.
cur = df.loc[:, "school":"guardian"].copy()
1
2
3
4
# Capitalize the first letter of the Mjob and Fjob values
cur['Mjob'] = cur['Mjob'].str.capitalize()
cur['Fjob'] = cur['Fjob'].str.capitalize()
# head has to be *called*; printing the bare attribute prints the bound method
print(cur.head())
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
<bound method NDFrame.head of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   At_home   Teacher   
1       GP   F   17       U     GT3       T     1     1   At_home     Other   
2       GP   F   15       U     LE3       T     1     1   At_home     Other   
3       GP   F   15       U     GT3       T     4     2    Health  Services   
4       GP   F   16       U     GT3       T     3     3     Other     Other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  Services  Services   
391     MS   M   17       U     LE3       T     3     1  Services  Services   
392     MS   M   21       R     GT3       T     1     1     Other     Other   
393     MS   M   18       R     LE3       T     3     2  Services     Other   
394     MS   M   19       U     LE3       T     1     1     Other   At_home   

     reason guardian  
0    course   mother  
1    course   father  
2     other   mother  
3      home   mother  
4      home   father  
..      ...      ...  
390  course    other  
391  course   mother  
392  course    other  
393  course   mother  
394  course   father  

[395 rows x 12 columns]>
1
2
# Print the last five rows of the data
print(cur.tail(5))
1
2
3
4
5
6
7
8
9
10
11
12
13
    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
390     MS   M   20       U     LE3       A     2     2  Services  Services   
391     MS   M   17       U     LE3       T     3     1  Services  Services   
392     MS   M   21       R     GT3       T     1     1     Other     Other   
393     MS   M   18       R     LE3       T     3     2  Services     Other   
394     MS   M   19       U     LE3       T     1     1     Other   At_home   

     reason guardian  
390  course    other  
391  course   mother  
392  course    other  
393  course   mother  
394  course   father  
1
2
3
# Add a 'legal_drinker' column: True when the student is older than 17,
# False otherwise
older_than_17 = cur['age'].gt(17)
cur['legal_drinker'] = older_than_17
print(cur.head())
1
2
3
4
5
6
7
8
9
10
11
12
13
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  At_home   Teacher   
1     GP   F   17       U     GT3       T     1     1  At_home     Other   
2     GP   F   15       U     LE3       T     1     1  At_home     Other   
3     GP   F   15       U     GT3       T     4     2   Health  Services   
4     GP   F   16       U     GT3       T     3     3    Other     Other   

   reason guardian  legal_drinker  
0  course   mother           True  
1  course   father          False  
2   other   mother          False  
3    home   mother          False  
4    home   father          False  
1
2
3
# Select the rows where both parents have the same job, excluding 'other'
# (the job values were capitalized earlier, hence the comparison with "Other")
same_job = cur['Mjob'].eq(cur['Fjob'])
not_other = cur['Mjob'].ne("Other")
MF_same = cur.loc[same_job & not_other]
print(MF_same.head())
1
2
3
4
5
6
7
8
9
10
11
12
13
   school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
16     GP   F   16       U     GT3       T     4     4  Services  Services   
18     GP   M   17       U     GT3       T     3     2  Services  Services   
21     GP   M   15       U     GT3       T     4     4    Health    Health   
25     GP   F   16       U     GT3       T     2     2  Services  Services   
29     GP   M   16       U     GT3       T     4     4   Teacher   Teacher   

        reason guardian  legal_drinker  
16  reputation   mother          False  
18      course   mother          False  
21       other   father          False  
25        home   mother          False  
29        home   mother          False  

Exercise 3

MovieLens 1M数据集含有来自6000名用户对4000部电影的100万条评分数据。它分为三个表:评分、用户信息和电影信息。

1
2
3
4
5
6
7
8
9
10
# Load the three MovieLens 1M tables. The .dat files have no header row, so
# column names are supplied explicitly. engine='python' is passed because the
# multi-character '::' separator is not handled by pandas' C parser.

# Users table
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_csv('./datasets/ml-1m/users.dat', sep='::',
                      header=None, names=unames, engine='python')

# Ratings table
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./datasets/ml-1m/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')
# Movies table
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_csv('./datasets/ml-1m/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')
1
2
3
# Merge the three DataFrames into a single one.
# No `on=` is given, so each merge joins on the columns the two frames share:
# movie_id for ratings/movies, then user_id for users.
data = pd.merge(users, pd.merge(ratings, movies))
# Bare expression: displays the merged frame in the notebook
data
user_id gender age occupation zip movie_id rating timestamp title genres
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama
1 1 F 1 10 48067 661 3 978302109 James and the Giant Peach (1996) Animation|Children's|Musical
2 1 F 1 10 48067 914 3 978301968 My Fair Lady (1964) Musical|Romance
3 1 F 1 10 48067 3408 4 978300275 Erin Brockovich (2000) Drama
4 1 F 1 10 48067 2355 5 978824291 Bug's Life, A (1998) Animation|Children's|Comedy
... ... ... ... ... ... ... ... ... ... ...
1000204 6040 M 25 6 11106 1150 5 956715569 Return of Martin Guerre, The (Retour de Martin... Drama
1000205 6040 M 25 6 11106 2751 1 956716438 From the Hip (1987) Comedy
1000206 6040 M 25 6 11106 3289 5 956704305 Not One Less (Yi ge dou bu neng shao) (1999) Drama
1000207 6040 M 25 6 11106 722 3 960971992 Haunted World of Edward D. Wood Jr., The (1995) Documentary
1000208 6040 M 25 6 11106 2503 5 956704191 Apple, The (Sib) (1998) Drama

1000209 rows × 10 columns

1
2
3
4
5
6
7
8
# Drop the movies that received fewer than 250 ratings (hint: use groupby())
grouped = ratings[['rating', 'movie_id']].groupby('movie_id')
rating_counts = grouped.count()
rating_counts = rating_counts.rename(columns={"rating": "movie_rate_count"})
cnt = rating_counts.reset_index()

# Attach each movie's rating count to every row, then keep the popular movies
data = data.merge(cnt)
enough_ratings = data['movie_rate_count'] >= 250
data = data.loc[enough_ratings]
data.head()
user_id gender age occupation zip movie_id rating timestamp title genres movie_rate_count
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama 1725
1 2 M 56 16 70072 1193 5 978298413 One Flew Over the Cuckoo's Nest (1975) Drama 1725
2 12 M 25 12 32793 1193 4 978220179 One Flew Over the Cuckoo's Nest (1975) Drama 1725
3 15 M 25 7 22903 1193 4 978199279 One Flew Over the Cuckoo's Nest (1975) Drama 1725
4 17 M 50 1 95350 1193 5 978158471 One Flew Over the Cuckoo's Nest (1975) Drama 1725
1
2
3
4
# Mean rating of every movie, split by gender (hint: use pivot_table()).
# No aggfunc is given, so pivot_table aggregates with its default, the mean.
tmp = data.pivot_table(values=['rating'], index=['gender', 'movie_id'])
print(tmp.head())
1
2
3
4
5
6
7
                   rating
gender movie_id          
F      1         4.187817
       2         3.278409
       3         3.073529
       5         3.212963
       6         3.682171
1
2
3
# Find the movie on which female and male viewers disagree the most.
# The gap is taken in absolute value so that a movie strongly preferred by men
# ranks as high as one strongly preferred by women; a signed F - M difference
# could only ever surface movies that women rate higher.
idx = (tmp.loc['F']-tmp.loc['M']).abs().sort_values(by='rating',ascending=False).iloc[0]
# idx is the top row of the sorted gaps; its name is the movie_id
print(movies.loc[movies['movie_id']==idx.name])
1
2
      movie_id                 title           genres
1072      1088  Dirty Dancing (1987)  Musical|Romance

Exercise 4

由于1M数据计算相似度矩阵易遇上内存问题,我们在MovieLens 100k上用协同过滤做简单的电影推荐。

参考资料:

  1. Introduction to Recommender System. Part 1 (Collaborative Filtering, Singular Value Decomposition)

  2. 推荐算法常见评测方法

  3. [机器学习]推荐系统之协同过滤算法

1
2
3
4
5
6
7
8
9
10
11
import numpy as np
import pandas as pd
# Column names for the tab-separated MovieLens 100k ratings file
header = ['user_id', 'movie_id', 'rating', 'timestamp']
df = pd.read_csv('./datasets/ml-100k/u.data', sep='\t', names=header)

# The numbers of distinct users and movies fix the size of the rating matrix
n_users = len(df['user_id'].unique())
n_items = len(df['movie_id'].unique())

# Hold out a quarter of the ratings as the test set using scikit-learn
from sklearn import model_selection
split = model_selection.train_test_split(df, test_size=0.25)
train_data, test_data = split
1
2
3
4
5
6
7
8
9
10
def _build_rating_matrix(frame, n_users, n_items):
    """Return a dense ``(n_users, n_items)`` rating matrix built from ``frame``.

    Args:
        frame: DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.
            Ids are mapped to row/column ``id - 1``, so they are expected to
            run from 1 to ``n_users`` / ``n_items``.
        n_users: number of rows of the result.
        n_items: number of columns of the result.

    Returns:
        Float array in which every observed rating sits at
        ``[user_id - 1, movie_id - 1]`` and every unobserved cell holds the
        mean of that user's observed ratings. A user with no rating at all in
        ``frame`` gets the mean over all observed ratings instead of raising.
    """
    matrix = np.full((n_users, n_items), np.nan)

    # Average duplicated (user, movie) pairs so each cell has a single value
    observed = frame.groupby(['user_id', 'movie_id'])['rating'].mean()
    rows = observed.index.get_level_values('user_id').to_numpy() - 1
    cols = observed.index.get_level_values('movie_id').to_numpy() - 1
    matrix[rows, cols] = observed.to_numpy()

    # Per-user mean over the observed cells only
    rated = ~np.isnan(matrix)
    counts = rated.sum(axis=1)
    sums = np.where(rated, matrix, 0.0).sum(axis=1)
    fallback = observed.mean() if len(observed) else 0.0
    # np.maximum avoids a 0/0 for users without ratings; they use the fallback
    user_mean = np.where(counts > 0, sums / np.maximum(counts, 1), fallback)

    return np.where(rated, matrix, user_mean[:, np.newaxis])


# Fill the rating matrices for the training and the test split.
# Missing entries are imputed with the user's mean rating in that split.
train_data_matrix = _build_rating_matrix(train_data, n_users, n_items)
test_data_matrix = _build_rating_matrix(test_data, n_users, n_items)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Compute the user-user and the item-item similarity matrices.
# pairwise_distances returns cosine *distances*; similarity is 1 - distance.
from sklearn.metrics.pairwise import pairwise_distances

# Rows of the rating matrix are users ...
user_distance = pairwise_distances(train_data_matrix, metric='cosine')
user_similarity = 1.0 - user_distance
# ... and rows of its transpose are items
item_distance = pairwise_distances(train_data_matrix.T, metric='cosine')
item_similarity = 1.0 - item_distance
# Predict ratings from a similarity matrix
def predict(ratings, similarity, type='user'):
    """Predict all ratings by similarity-weighted averaging.

    Args:
        ratings: 2-D array of shape (n_users, n_items) with the known ratings.
        similarity: square similarity matrix, (n_users, n_users) for
            ``type='user'`` and (n_items, n_items) for ``type='item'``.
        type: ``'user'`` for user-based or ``'item'`` for item-based
            collaborative filtering. The name shadows the builtin but is kept
            so existing keyword calls continue to work.

    Returns:
        Array of the same shape as ``ratings`` holding the predicted ratings.
        A row of ``similarity`` whose absolute values sum to zero leads to a
        division by zero in the normalisation.

    Raises:
        ValueError: if ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type == 'user':
        # Work on deviations from each user's mean so that users who rate
        # systematically high or low are comparable, then add the mean back.
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        ratings_diff = ratings - mean_user_rating
        weight_total = np.abs(similarity).sum(axis=1, keepdims=True)
        return mean_user_rating + similarity.dot(ratings_diff) / weight_total

    if type == 'item':
        # Weighted average of the user's ratings over similar items
        weight_total = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return ratings.dot(similarity) / weight_total

    # Previously an unknown value fell through to `return pred` and failed
    # with an UnboundLocalError; report the real problem instead.
    raise ValueError(f"type must be 'user' or 'item', got {type!r}")
1
2
3
4
5
6
7
8
9
10
11
12
13
# Evaluate user-based and item-based collaborative filtering on the test set.
# The metric is RMSE (see reference 2).
ubpre = predict(train_data_matrix, user_similarity, 'user')
ibpre = predict(train_data_matrix, item_similarity, 'item')

# Zero-based matrix coordinates of every held-out rating (ids start at 1)
test_rows = test_data['user_id'].to_numpy() - 1
test_cols = test_data['movie_id'].to_numpy() - 1
actual = test_data['rating'].to_numpy()

# Sums of squared errors, computed in one vectorized pass rather than with a
# Python-level iterrows() loop over every test rating
usum = np.square(actual - ubpre[test_rows, test_cols]).sum()
isum = np.square(actual - ibpre[test_rows, test_cols]).sum()
# Baseline for comparison: always predict the constant rating 3
foo = np.square(actual - 3).sum()

T=test_data.shape[0]
print(np.sqrt(usum/T), np.sqrt(isum/T), np.sqrt(foo/T))
1
1.0312188314286472 1.0493218081215114 1.2511914321957291