Python implementation of an item-based collaborative filtering algorithm
- 2020-10-31 21:51:28
- OfStack
This experiment implements item-based collaborative filtering on the MovieLens data set. At present it has only been run on a small sample, mainly because the computation consumes too much memory; the code will continue to be optimized and improved later.
Data set description: The data in ratings.dat is the users' ratings of movies. Data format: UserID::MovieID::Rating::Timestamp.
code
import pandas as pd
import numpy as np
import math
import os
import time
import datetime
# Change into the author's project directory; loadData() below reads its data
# file through a relative path, so that path is resolved from here.
# NOTE(review): hard-coded, machine-specific Windows path - adjust before running elsewhere.
os.chdir(r'f:\zxx\pthon_work\CF')
def loadData(path=r'.\data\ratings.dat', nrows=300000):
    """Load MovieLens ratings from a ``::``-delimited text file.

    Args:
        path: Location of the ratings file. Defaults to the relative path
            the script has always used.
        nrows: Maximum number of rows to read; ``None`` reads the whole
            file. Defaults to the 300000-row sample the script has always
            used.

    Returns:
        A DataFrame with the columns UserID, MovieID, Rating, Timestamp.
    """
    # movies.dat and tags.dat are not needed yet; how to read them is kept
    # here for reference.
    #mnames=['movie_id','title','genres']
    #movies=pd.read_table(r'.\data\movies.dat',sep='::',header=None,names=mnames)
    #tnames=['UserID','MovieID','Tag','Timestamp']
    #tags=pd.read_table(r'.\data\tags.dat',sep='::',header=None,names=tnames)
    rnames = ['UserID', 'MovieID', 'Rating', 'Timestamp']
    # A separator longer than one character is treated as a regular
    # expression, which only the Python parsing engine supports. Asking for
    # it explicitly avoids the ParserWarning and the silent engine fallback.
    all_ratings = pd.read_table(
        path,
        sep='::',
        header=None,
        names=rnames,
        nrows=nrows,
        engine='python',
    )
    return all_ratings
# Data exploration: ratings
def data_alay(ratings):
    """Count the distinct (UserID, MovieID) pairs in *ratings*.

    Sample row: 1 122 5 838985046
    Columns: 'UserID', 'MovieID', 'Rating', 'Timestamp'
    (The author's original note mentions 10000054 ratings in the full file.)

    Comparing the result with ``len(ratings)`` shows whether every user
    rated each film at most once: the two numbers are equal exactly when no
    (user, film) pair occurs more than once.

    Args:
        ratings: DataFrame with at least the UserID and MovieID columns.

    Returns:
        The number of distinct (UserID, MovieID) pairs.
    """
    # GroupBy.size is a method; it has to be called before len() can be
    # applied (len() of the bound method itself raises TypeError).
    pair_sizes = ratings.groupby(['UserID', 'MovieID']).size()
    return len(pair_sizes)
# Average rating of each film (the author notes 10677 films in the full data set).
def avgRating(ratings):
    """Compute the mean rating of every film over all users who rated it.

    Args:
        ratings: DataFrame with MovieID and Rating columns.

    Returns:
        A 3-tuple ``(movies_id, movies_avg_rating, movies_mean)``:
        the film ids (index of the mean Series), the mean ratings as an
        array in the same order, and the Series itself, indexed by MovieID.
    """
    movies_mean = ratings.groupby('MovieID')['Rating'].mean()
    return movies_mean.index, movies_mean.values, movies_mean
# Build the film-to-film similarity table (conceptually a 10677*10677 matrix
# on the full data set).
def calculatePC(ratings):
    """Compute the Pearson similarity of every pair of films in *ratings*.

    Args:
        ratings: DataFrame with UserID, MovieID and Rating columns.

    Returns:
        A dict mapping the string ``'<idA>:<idB>'`` to the similarity of
        films A and B. Each unordered pair is stored under both key orders;
        a film is never paired with itself.
    """
    movies_id, _, movies_mean = avgRating(ratings)

    # Collect the raters of every film once, instead of re-filtering the
    # whole frame for both films on every iteration of the pair loop.
    raters = {
        movie_id: users.values
        for movie_id, users in ratings.groupby('MovieID')['UserID']
    }

    pc_dic = {}
    for i, movieAID in enumerate(movies_id):
        # Everything that depends only on film A is looked up once per row.
        users_a = raters[movieAID]
        movieA_avg = movies_mean[movieAID]
        for movieBID in movies_id[i + 1:]:
            # Users who rated both film A and film B.
            join_user = np.intersect1d(users_a, raters[movieBID])
            value = twoMoviesPC(
                join_user,
                movieAID,
                movieBID,
                movieA_avg,
                movies_mean[movieBID],
                ratings,
            )
            pc_dic[str(movieAID) + ':' + str(movieBID)] = value
            pc_dic[str(movieBID) + ':' + str(movieAID)] = value
    return pc_dic
# Pearson similarity of films A and B over the users who rated both:
#   sum[(A-A^)*(B-B^)] / sqrt(sum[(A-A^)*(A-A^)] * sum[(B-B^)*(B-B^)])
# where A^ and B^ are the films' mean ratings.
def twoMoviesPC(join_user, movieAID, movieBID, movieA_avg, movieB_avg, ratings):
    """Compute the mean-centred (Pearson) similarity of two films.

    Args:
        join_user: Ids of the users who rated both films.
        movieAID: Id of film A.
        movieBID: Id of film B.
        movieA_avg: Mean rating of film A.
        movieB_avg: Mean rating of film B.
        ratings: DataFrame with UserID, MovieID and Rating columns.

    Returns:
        The similarity, or 0.0 when there are no common users or when
        either film's centred ratings are all zero (the formula's
        denominator would be zero).

    Raises:
        KeyError: If a user in *join_user* has no rating for one of the films.
    """
    if len(join_user) == 0:
        return 0.0

    def _deviations(movie_id, movie_avg):
        # Centred ratings of one film, ordered like join_user.
        rows = ratings[ratings['MovieID'] == movie_id]
        # Keep the first rating per user, as the per-user lookup did.
        per_user = rows.drop_duplicates('UserID').set_index('UserID')['Rating']
        return per_user.loc[list(join_user)].to_numpy(dtype=float) - movie_avg

    dev_a = _deviations(movieAID, movieA_avg)
    dev_b = _deviations(movieBID, movieB_avg)

    cent_AB_sum = np.dot(dev_a, dev_b)  # numerator
    centA_sum = np.dot(dev_a, dev_a)    # denominator, film A part
    centB_sum = np.dot(dev_b, dev_b)    # denominator, film B part

    if centA_sum > 0 and centB_sum > 0:
        return cent_AB_sum / math.sqrt(centA_sum * centB_sum)
    return 0.0
"""
Predicting which movies user U will be interested in takes 3 steps:
1) Get the movies user U has watched in the past X days.
2) Setting aside the movies user U has already seen, use user U's viewing history to compute predicted ratings for the other movies.
3) Recommend the movies with the highest predicted ratings to the user.
Predicting user U's rating of a movie C takes 2 steps (only this part is implemented for now):
1) Get the movies user U has watched in the past X days.
2) Use the weighted, mean-centred formula to predict user U's rating of movie C.
"""
# Date handling: step back a number of days from the user's latest rating
# time and return the result as Unix time.
def timePro(last_rat_time, UserU, days=3):
    """Return the Unix time *days* days before a user's latest rating.

    The offset is applied directly in seconds. Converting to a naive local
    datetime and back (fromtimestamp / mktime) makes the result depend on
    the machine's time zone and can shift it by an hour across a daylight
    saving transition.

    Args:
        last_rat_time: Mapping (e.g. a Series indexed by UserID) from user
            id to the Unix time of that user's latest rating.
        UserU: Id of the user to look up.
        days: Size of the look-back window in days. Defaults to 3.

    Returns:
        The cut-off time as a float number of seconds since the epoch.
    """
    window_seconds = datetime.timedelta(days=days).total_seconds()
    return float(last_rat_time[UserU]) - window_seconds
# Ratings the user made in the 3 days leading up to their most recent rating;
# these are the history the prediction is based on.
def getHisRat(ratings, last_rat_time, UserUID):
    """Select a user's ratings from the window before their latest rating.

    Args:
        ratings: DataFrame with UserID and Timestamp columns.
        last_rat_time: Mapping from user id to the Unix time of that user's
            latest rating.
        UserUID: Id of the user whose history is wanted.

    Returns:
        The rows of *ratings* that belong to *UserUID* and whose Timestamp
        is strictly later than the cut-off returned by ``timePro``.
    """
    unix_sub3 = timePro(last_rat_time, UserUID)
    # Combine both conditions into one mask. Chaining two boolean lookups
    # applies a full-length mask to an already filtered frame, which pandas
    # has to reindex (and warns about).
    in_window = (ratings['UserID'] == UserUID) & (ratings['Timestamp'] > unix_sub3)
    UserU_info = ratings[in_window]
    return UserU_info
# Predict user U's rating of a film.
def hadSeenMovieByUser(UserUID, MovieA, ratings, pc_dic, movies_mean):
    """Predict the rating a user would give a film and print the result.

    If the film is among the user's recent ratings (see ``getHisRat``),
    that rating is returned. Otherwise the prediction is the film's mean
    rating plus the similarity-weighted average of the user's mean-centred
    recent ratings.

    Args:
        UserUID: Id of the user.
        MovieA: Id of the film to predict.
        ratings: DataFrame with UserID, MovieID, Rating, Timestamp columns.
        pc_dic: Similarity table keyed by ``'<idA>:<idB>'`` strings, as
            built by ``calculatePC``.
        movies_mean: Mean rating per film, indexable by MovieID.

    Returns:
        ``(pre_rating, flag)``: the rating as a plain float, and 1 when the
        user has already rated the film in the recent window or 0 when the
        rating is a prediction.

    Raises:
        KeyError: If the user has no ratings, or a needed similarity or
            film mean is missing.
    """
    # Time of each user's most recent rating.
    last_rat_time = ratings['Timestamp'].groupby([ratings['UserID']]).max()
    # The user's recent rating history.
    UserU_info = getHisRat(ratings, last_rat_time, UserUID)

    already_rated = UserU_info[UserU_info['MovieID'] == MovieA]
    if len(already_rated) > 0:
        flag = 1
        # Take the scalar, not the one-element array.
        pre_rating = float(already_rated['Rating'].values[0])
    else:
        flag = 0
        wmv = 0.0  # sum of similarity * (rating - film mean)
        w = 0.0    # sum of similarities
        history = zip(UserU_info['MovieID'].values, UserU_info['Rating'].values)
        for mv, rat_U_mv in history:
            similarity = pc_dic[str(mv) + ':' + str(MovieA)]
            wmv += similarity * (rat_U_mv - movies_mean[mv])
            w += similarity
        pre_rating = float(movies_mean[MovieA])
        # With no usable similarity (empty history, or similarities that
        # sum to zero) the weighted term is undefined; fall back to the
        # film's mean rating instead of dividing by zero.
        if w != 0:
            pre_rating += float(wmv / w)
    print ('-flag:%d---User:%d rating movie:%d with %f score----' %(flag,UserUID,MovieA,pre_rating))
    return pre_rating, flag
if __name__=='__main__':
    all_ratings=loadData()
    # Limit the run to films whose MovieID is within this range; otherwise
    # the amount of data is too large to process.
    movie_num=100
    ratings=all_ratings[all_ratings['MovieID']<=movie_num]
    # Only the per-film mean Series is needed below.
    _,_,movies_mean=avgRating(ratings)
    pc_dic=calculatePC(ratings)# Film similarity table
    # Predict
    UserUID=10# In the current subset this user has only rated films 4 and 7.
    MovieA=6
    pre_rating,flag=hadSeenMovieByUser(UserUID,MovieA,ratings,pc_dic,movies_mean)
    # ----------------- test ID extract ------------------
    # The two expressions below are meant for an interactive session; run as
    # a script they are evaluated and their results discarded.
    # Select a UserUID: pick any user id from the first 10 rows, e.g. UserID=10.
    ratings.head(10)
    # See which films that user has rated in the current subset, so that a
    # film they have not rated can be chosen for prediction. User 10 has
    # only rated films 4 and 7 here, so any other id (for example 6) works.
    ratings[ratings['UserID']==10]
Operation results:
[-flag:0---User:10 rating movie:6 with 4.115996 score----
]