Python implementation of an item-based collaborative filtering algorithm

  • 2020-10-31 21:51:28
  • OfStack

This experiment implements item-based collaborative filtering on the MovieLens data set. At present it has only been run on small samples. The main problem is that the calculation consumes too much memory; the code will continue to be optimized and improved later.

Data set description: The data in ratings.dat contains the users' ratings of the movies. Data format: UserID::MovieID::Rating::Timestamp.

code


import pandas as pd
import numpy as np
import math 
import os
import time
import datetime

# NOTE(review): hard-coded working directory from the original author's machine.
# The relative data path used by loadData() is resolved against it, so this
# line must be adjusted before the script is run anywhere else.
os.chdir(r'f:\zxx\pthon_work\CF')

def loadData(path=os.path.join('.', 'data', 'ratings.dat'), nrows=300000):
    """Load MovieLens ratings into a DataFrame.

    The file is expected to hold one rating per line in the form
    ``UserID::MovieID::Rating::Timestamp`` with no header row.

    Args:
        path: Location of the ratings file. The default is ``data/ratings.dat``
            relative to the current working directory (on Windows this is
            the same ``.\\data\\ratings.dat`` the script always used).
        nrows: Maximum number of rows to read; ``None`` reads the whole file.
            The default keeps the sample small enough for the pairwise
            similarity computation.

    Returns:
        DataFrame with columns ``UserID``, ``MovieID``, ``Rating`` and
        ``Timestamp``.
    """
    rnames = ['UserID', 'MovieID', 'Rating', 'Timestamp']
    # '::' is a multi-character separator, which only the Python parsing
    # engine supports. Requesting it explicitly avoids the ParserWarning
    # pandas emits when it has to fall back from the C engine.
    all_ratings = pd.read_table(
        path,
        sep='::',
        header=None,
        names=rnames,
        nrows=nrows,
        engine='python',
    )
    return all_ratings

# Data exploration: ratings
def data_alay(ratings):
    """Count the distinct (UserID, MovieID) pairs in *ratings*.

    This is a sanity check that each user rated each movie only once: when
    that holds, the returned count equals ``len(ratings)``.

    Original author's notes on the full data set: 10000054 ratings; sample
    row ``1  122  5  838985046``; columns 'UserID', 'MovieID', 'Rating',
    'Timestamp'.

    Args:
        ratings: DataFrame with at least ``UserID`` and ``MovieID`` columns.

    Returns:
        Number of distinct (user, movie) combinations.
    """
    # ``size`` is a method; the original took len() of the bound method
    # itself, which always raised TypeError.
    pair_counts = ratings.groupby([ratings['UserID'], ratings['MovieID']]).size()
    return len(pair_counts)

# Average rating of every movie (the original author notes 10677 movies in
# the full data set).
def avgRating(ratings):
    """Compute the mean rating of each movie over all users who rated it.

    Args:
        ratings: DataFrame with ``MovieID`` and ``Rating`` columns.

    Returns:
        A 3-tuple ``(movies_id, movies_avg_rating, movies_mean)``: the index
        of movie IDs, the array of matching mean ratings, and the Series
        that maps each movie ID to its mean rating.
    """
    grouped_by_movie = ratings['Rating'].groupby(ratings['MovieID'])
    mean_by_movie = grouped_by_movie.mean()
    return mean_by_movie.index, mean_by_movie.values, mean_by_movie

# Build the movie-to-movie similarity table (conceptually an N x N matrix,
# N = number of distinct movies; the author notes 10677 for the full data).
def calculatePC(ratings):
    """Compute the pairwise Pearson-style similarity of all movies.

    Args:
        ratings: DataFrame with ``UserID``, ``MovieID`` and ``Rating`` columns.

    Returns:
        Dict keyed by ``"<movieA>:<movieB>"`` strings. Every unordered pair
        of distinct movies is stored under both key orders with the same
        similarity value, as returned by ``twoMoviesPC``.
    """
    movies_id, movies_avg_rating, movies_mean = avgRating(ratings)

    # Collect each movie's raters once, instead of re-scanning the whole
    # frame with a boolean mask for every (A, B) pair.
    users_by_movie = {
        movie: users.values
        for movie, users in ratings['UserID'].groupby(ratings['MovieID'])
    }

    pc_dic = {}
    for i, movieAID in enumerate(movies_id):
        # Everything about movie A is invariant across the inner loop.
        see_moviesA_user = users_by_movie[movieAID]
        movieA_avg = movies_mean[movieAID]
        for movieBID in movies_id[i + 1:]:
            # Users who rated both movie A and movie B.
            join_user = np.intersect1d(see_moviesA_user, users_by_movie[movieBID])
            value = twoMoviesPC(
                join_user, movieAID, movieBID,
                movieA_avg, movies_mean[movieBID], ratings,
            )
            # The similarity is symmetric: store it under both key orders.
            pc_dic[str(movieAID) + ':' + str(movieBID)] = value
            pc_dic[str(movieBID) + ':' + str(movieAID)] = value
    return pc_dic

# Pearson-style similarity of movie A and movie B over their common raters:
#   sum((A - A^) * (B - B^)) / sqrt(sum((A - A^)^2) * sum((B - B^)^2))
# where A^ and B^ are the supplied average ratings of each movie.
def twoMoviesPC(join_user, movieAID, movieBID, movieA_avg, movieB_avg, ratings):
    """Compute the mean-centred similarity between two movies.

    Args:
        join_user: Iterable of IDs of the users who rated both movies.
        movieAID: ID of movie A.
        movieBID: ID of movie B.
        movieA_avg: Average rating of movie A used for centring.
        movieB_avg: Average rating of movie B used for centring.
        ratings: DataFrame with ``UserID``, ``MovieID`` and ``Rating`` columns.

    Returns:
        The similarity, or ``0.0`` when there are no common users or when
        either movie's centred ratings have zero spread over those users.

    Raises:
        IndexError: If a user in *join_user* has no rating for one of the
            two movies.
    """
    if len(join_user) == 0:
        # No common raters: skip the frame scans entirely.
        return 0.0

    # The per-movie filters do not depend on the user, so apply them once
    # rather than re-masking the whole frame for every common user.
    rows_a = ratings[ratings['MovieID'] == movieAID]
    rows_b = ratings[ratings['MovieID'] == movieBID]

    cent_AB_sum = 0.0  # numerator: sum of products of centred ratings
    centA_sum = 0.0    # denominator part: sum of squared deviations of A
    centB_sum = 0.0    # denominator part: sum of squared deviations of B
    for user in join_user:
        # First rating this user gave to each movie.
        ratA = rows_a['Rating'][rows_a['UserID'] == user].values[0]
        ratB = rows_b['Rating'][rows_b['UserID'] == user].values[0]
        devA = ratA - movieA_avg
        devB = ratB - movieB_avg
        cent_AB_sum += devA * devB
        centA_sum += devA * devA
        centB_sum += devB * devB

    if centA_sum > 0 and centB_sum > 0:
        return cent_AB_sum / math.sqrt(centA_sum * centB_sum)
    return 0.0

"""
 Predicting which movies user U will be interested in takes 3 steps:
 1) Collect the movies user U rated during the last X days.
 2) Excluding the movies user U has already seen, use U's past ratings to compute U's predicted ratings of the other movies.
 3) Recommend the movies with the highest predicted ratings to the user.
 Predicting user U's rating of movie C takes the following steps (only this part is implemented for now):
 1) Collect the movies user U rated during the last X days.
 2) Use the similarity-weighted, mean-centered formula to predict user U's rating of movie C.

"""
# Date handling: step back 3 days from the user's last rating time (Unix time).
def timePro(last_rat_time, UserU, days=3):
    """Return the Unix timestamp *days* days before a user's last rating.

    The offset is applied directly in epoch seconds. The previous round trip
    through local ``datetime`` and ``time.mktime`` made the result depend on
    the machine's timezone (it shifted by an hour across DST changes).

    Args:
        last_rat_time: Mapping (dict or Series) from user ID to the Unix
            timestamp of that user's most recent rating.
        UserU: ID of the user to look up.
        days: Size of the look-back window in days.

    Returns:
        The cut-off timestamp as a float.

    Raises:
        KeyError: If *UserU* is not present in *last_rat_time*.
    """
    seconds_per_day = 24 * 60 * 60
    return float(last_rat_time[UserU]) - days * seconds_per_day

# Select the ratings a user gave within 3 days before their last rating;
# these are the ratings used to make predictions.
def getHisRat(ratings, last_rat_time, UserUID):
    """Return the recent rating history of one user.

    Args:
        ratings: DataFrame with ``UserID`` and ``Timestamp`` columns.
        last_rat_time: Mapping from user ID to the Unix timestamp of that
            user's most recent rating.
        UserUID: ID of the user whose history is wanted.

    Returns:
        The rows of *ratings* that belong to *UserUID* and whose timestamp
        is strictly later than the cut-off computed by ``timePro``.
    """
    unix_sub3 = timePro(last_rat_time, UserUID)
    # Combine both conditions into one mask. Chaining two boolean
    # selections applies a full-length mask to an already filtered frame,
    # which makes pandas reindex the mask and emit a UserWarning.
    is_user = ratings['UserID'] == UserUID
    is_recent = ratings['Timestamp'] > unix_sub3
    UserU_info = ratings[is_user & is_recent]
    return UserU_info

# Predict the rating user U would give to movie A.
def hadSeenMovieByUser(UserUID, MovieA, ratings, pc_dic, movies_mean):
    """Predict (or look up) a user's rating of a movie.

    If the user rated *MovieA* within their recent history (see
    ``getHisRat``), that rating is returned. Otherwise the rating is
    predicted as the movie's mean rating plus the similarity-weighted
    average of the user's mean-centred recent ratings.

    Args:
        UserUID: ID of the user.
        MovieA: ID of the movie to predict.
        ratings: DataFrame with ``UserID``, ``MovieID``, ``Rating`` and
            ``Timestamp`` columns.
        pc_dic: Similarity table keyed by ``"<movie>:<movie>"`` strings, as
            built by ``calculatePC``.
        movies_mean: Mapping from movie ID to its mean rating.

    Returns:
        Tuple ``(pre_rating, flag)``: the rating as a float, and ``1`` if it
        is the user's own recent rating or ``0`` if it was predicted.

    Raises:
        KeyError: If the user, the movie mean, or a needed similarity entry
            is missing.
    """
    # Timestamp of every user's most recent rating.
    last_rat_time = ratings['Timestamp'].groupby([ratings['UserID']]).max()
    # The user's recent history (already restricted to this user).
    UserU_info = getHisRat(ratings, last_rat_time, UserUID)

    movie_userU = UserU_info['MovieID'].values
    rating_userU = UserU_info['Rating'].values

    if MovieA in movie_userU:
        # The user already rated this movie recently: report that rating
        # as a scalar rather than a 1-element array.
        flag = 1
        pre_rating = float(rating_userU[movie_userU == MovieA][0])
    else:
        flag = 0
        wmv = 0.0  # sum of similarity * (user's rating - movie mean)
        w = 0.0    # sum of similarities to the target movie
        for mv, rat_U_mv in zip(movie_userU, rating_userU):
            similarity = pc_dic[str(mv) + ':' + str(MovieA)]
            wmv += similarity * (rat_U_mv - movies_mean[mv])
            w += similarity
        pre_rating = float(movies_mean[MovieA])
        # NOTE(review): the textbook item-based formula normalises by the
        # sum of absolute similarities. The plain sum is kept here so that
        # previously published results stay reproducible.
        if w != 0:
            pre_rating += float(wmv / w)
        # With a zero similarity sum there is no usable evidence, so the
        # prediction falls back to the movie's mean instead of dividing by zero.
    print ('-flag:%d---User:%d rating movie:%d with %f score----' %(flag,UserUID,MovieA,pre_rating))
    return pre_rating, flag

# Script entry point: load a sample of the ratings, build the movie similarity
# table for a small range of movie IDs, and predict one user's rating of one movie.
if __name__=='__main__':
 all_ratings=loadData()
 movie_num=100# Upper bound on MovieID: only movies with an ID up to this value are used, otherwise the data volume is too large
 ratings=all_ratings[all_ratings['MovieID']<=movie_num]

 movies_id,movies_avg_rating,movies_mean=avgRating(ratings)
 pc_dic=calculatePC(ratings)# Movie-to-movie similarity table
 # Make a prediction
 UserUID=10# Per the original author, this user has only rated movies 4 and 7 in the current subset
 MovieA=6 
 pre_rating,flag=hadSeenMovieByUser(UserUID,MovieA,ratings,pc_dic,movies_mean)

 "----------------- test ID extract ------------------"
 # How the test IDs above were chosen. The two expressions below discard their
 # results when run as a script; they are meant for interactive inspection.
 # Select UserUID
 ratings.head(10)# Pick a user ID from the first 10 rows, e.g. UserID=10
 # Check which movies that user has rated in the current subset, so that a movie the user has not yet rated can be chosen
 ratings[ratings['UserID']==10]# Per the original author, the user has only rated MovieID 4 and 7 here, so any other ID (for example 6) can be predicted

Operation results:

[

-flag:0---User:10 rating movie:6 with 4.115996 score----

]

Related articles: