# ---------------------------------------------------------------------- # # Ruby adaptations of the Python code found in Toby Segaran's # Programming Collective Intelligence book. # # steven.romej @ gmail (4 may 08) # ---------------------------------------------------------------------- class Recommendations # ------------------------------------------------- # Euclidean distance # ------------------------------------------------- # Returns a distance-based similarity score for person1 and person2 def sim_distance( prefs , person1 , person2 ) # Get the list of shared_items si = {} for item in prefs[person1].keys if prefs[person2].include? item si[item] = 1 end end # if they have no ratings in common, return 0 return 0 if si.length == 0 squares = [] for item in prefs[person1].keys if prefs[person2].include? item squares << (prefs[person1][item] - prefs[person2][item]) ** 2 end end sum_of_squares = squares.inject { |sum,value| sum += value } return 1/(1 + sum_of_squares) end # ------------------------------------------------- # Pearson score # ------------------------------------------------- # Returns the Pearson correlation coefficient for p1 and p2 def sim_pearson( prefs, p1, p2) # Get the list of mutually rated items si = {} for item in prefs[p1].keys si[item] = 1 if prefs[p2].include? item end # Find the number of elements n = si.length # If there are no ratings in common, return 0 return 0 if n == 0 # Add up all the preferences sum1 = si.keys.inject(0) { |sum,value| sum += prefs[p1][value] } sum2 = si.keys.inject(0) { |sum,value| sum += prefs[p2][value] } # Sum up the squares sum1Sq = si.keys.inject(0) { |sum,value| sum += prefs[p1][value] ** 2 } sum2Sq = si.keys.inject(0) { |sum,value| sum += prefs[p2][value] ** 2 } # Sum up the products pSum = si.keys.inject(0) { |sum,value| sum += (prefs[p1][value] * prefs[p2][value])} # Calculate the Pearson score num = pSum - (sum1*sum2/n) den = Math.sqrt((sum1Sq - (sum1 ** 2)/n) * (sum2Sq - (sum2 ** 2)/n)) return 0 if den == 0 r = num / den end # Ranking the critics # TODO lacks the score-function-as-parameter aspect of original def topMatches( prefs, person, n=5, scorefunc = :sim_pearson ) scores = [] for other in prefs.keys if scorefunc == :sim_pearson scores << [ sim_pearson(prefs,person,other), other] if other != person else scores << [ sim_distance(prefs,person,other), other] if other != person end end return scores.sort.reverse.slice(0,n) end # Gets recommendations for a person by using a weighted average # of every other user's rankings # TODO just uses sim_pearson and not a function as parameter def getRecommendations(prefs, person, scorefunc = :sim_pearson ) totals = {} simSums = {} for other in prefs.keys # don't compare me to myself next if other == person if scorefunc == :sim_pearson sim = sim_pearson( prefs, person, other) else sim = sim_distance( prefs, person, other) end # ignore scores of zero or lower next if sim <= 0 for item in prefs[other].keys # only score movies I haven't seen yet if !prefs[person].include? item or prefs[person][item] == 0 # similarity * score totals.default = 0 totals[item] += prefs[other][item] * sim # sum of similarities simSums.default = 0 simSums[item] += sim end end end # Create a normalized list rankings = [] totals.each do |item,total| rankings << [total/simSums[item], item] end # Return the sorted list return rankings.sort.reverse end def transformPrefs( prefs ) result = {} for person in prefs.keys for item in prefs[person].keys result[item] = {} if result[item] == nil # Flip item and person result[item][person] = prefs[person][item] end end return result end def calculateSimilarItems( prefs, n = 10 ) # Create a dictionary of items showing which other items they are most similar to result = {} # Invert the preference matrix to be item-centric itemPrefs = transformPrefs(prefs) c = 0 for item in itemPrefs.keys # Status updates for large datasets c += 1 puts "#{c}/#{itemPrefs.length}" if c % 100 == 0 # Find the most similar items to this one scores = topMatches(itemPrefs, item, n, :sim_distance) result[item] = scores end return result end def getRecommendedItems( prefs, itemMatch, user) userRatings = prefs[user] scores = {} totalSim = {} # Loop over items rated by this user userRatings.each do |item,rating| itemMatch[item].each do |similarity,item2| # Ignore if this user has already rated this item next if userRatings.include? item2 # Weighted sum of rating times similarity scores[item2] = 0 if scores[item2] == nil scores[item2] += similarity * rating # Sum of all the similarities totalSim[item2] = 0 if totalSim[item2] == nil totalSim[item2] += similarity end end # Divide each total score by total weighting to get an average rankings = [] scores.each do |item,score| rankings << [score/totalSim[item], item] end return rankings.sort.reverse end def loadMovieLens( path = "ml-data" ) movies = {} File.open(path + "/u.item") do |file| while !file.eof? (id,title) = file.readline.split("|")[0,2] movies[id] = title end end prefs = {} File.open(path + "/u.data") do |file| while !file.eof? (user,movieid,rating,ts) = file.readline.split("\t") prefs[user] = {} if prefs[user] == nil prefs[user][movies[movieid]] = rating.to_f end end return prefs end end # ---------------------------------------------------------------------- # A simple class that implements the necessary pydelicious functions # Sleeps 1 second after each request to prevent 503 errors # ---------------------------------------------------------------------- require 'net/http' require 'rexml/document' require 'digest/md5' module Delicious # Get a list of popular urls (title and link) def get_popular( tag = "" ) popular = [] url = "http://del.icio.us/rss/popular/#{tag}" response = Net::HTTP.get_response(URI.parse(url)).body doc = REXML::Document.new(response) doc.elements.each("//item") do |item| popular << { "title" => item.elements["title"].text , "href" => item.elements["link"].text } end sleep 1 return popular end # Get a list of users that posted the url def get_urlposts( url ) urlposts = [] urlcode = Digest::MD5.hexdigest(url) url = "http://feeds.delicious.com/rss/url/#{urlcode}" response = Net::HTTP.get_response(URI.parse(url)).body doc = REXML::Document.new(response) doc.elements.each("//item") do |item| urlposts << { "user" => item.elements["dc:creator"].text } end sleep 1 return urlposts end # Get a list of urls by username def get_userposts( user ) posts = [] url = "http://feeds.delicious.com/rss/#{user}" response = Net::HTTP.get_response(URI.parse(url)).body doc = REXML::Document.new(response) doc.elements.each("//item") do |item| posts << { "href" => item.elements["link"].text } end sleep 1 return posts end end # ---------------------------------------------------------------------- # A del.icio.us link recommendation engine # ---------------------------------------------------------------------- class DeliciousRec include Delicious # returns a dictionary of users, each pointing to empty hash def initializeUserDict( tag, count = 1 ) user_dict = {} # get the top 'count' popular posts for p1 in get_popular(tag)[0,count] # find all users who posted this for p2 in get_urlposts(p1["href"]) user = p2["user"] user_dict[user] = {} end end return user_dict end def fillItems( user_dict ) all_items = {} # Find links posted by all users for user in user_dict.keys for attempt in 1..3 begin puts "fetching for #{user}" posts = get_userposts(user) puts "fetched posts for #{user}" sleep 2 break rescue puts "Failed user #{user}, retrying" sleep 4 end end for post in posts url = post["href"] user_dict[user][url] = 1 all_items[url] = 1 end end # Fill in missing items with 0 for ratings in user_dict.values for item in all_items.keys ratings[item] = 0 if !ratings.include? item end end end end class App def run # A dictionary of movie critics and their ratings of a small # set of movies critics = {'Lisa Rose'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.5, 'Just My Luck'=> 3.0, 'Superman Returns'=> 3.5, 'You, Me and Dupree'=> 2.5, 'The Night Listener'=> 3.0}, 'Gene Seymour'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 3.5, 'Just My Luck'=> 1.5, 'Superman Returns'=> 5.0, 'The Night Listener'=> 3.0, 'You, Me and Dupree'=> 3.5}, 'Michael Phillips'=> {'Lady in the Water'=> 2.5, 'Snakes on a Plane'=> 3.0, 'Superman Returns'=> 3.5, 'The Night Listener'=> 4.0}, 'Claudia Puig'=> {'Snakes on a Plane'=> 3.5, 'Just My Luck'=> 3.0, 'The Night Listener'=> 4.5, 'Superman Returns'=> 4.0, 'You, Me and Dupree'=> 2.5}, 'Mick LaSalle'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0, 'Just My Luck'=> 2.0, 'Superman Returns'=> 3.0, 'The Night Listener'=> 3.0, 'You, Me and Dupree'=> 2.0}, 'Jack Matthews'=> {'Lady in the Water'=> 3.0, 'Snakes on a Plane'=> 4.0, 'The Night Listener'=> 3.0, 'Superman Returns'=> 5.0, 'You, Me and Dupree'=> 3.5}, 'Toby'=> {'Snakes on a Plane'=>4.5,'You, Me and Dupree'=>1.0,'Superman Returns'=>4.0} } recommendations = Recommendations.new # Test the Euclidean score code puts "The Euclidean distance score is #{recommendations.sim_distance( critics , "Lisa Rose", "Gene Seymour")}" # Test the Pearson score puts "The Pearson score is #{recommendations.sim_pearson( critics , "Lisa Rose" , "Gene Seymour" )}" # Test the topMatches puts recommendations.topMatches(critics,"Toby", 3) # Try getting recommendations puts recommendations.getRecommendations(critics,"Toby") # Transform the preferences, get recommendation movies = recommendations.transformPrefs( critics ) puts recommendations.topMatches( movies , "Superman Returns") itemsim = recommendations.calculateSimilarItems(critics) puts itemsim puts recommendations.getRecommendedItems(critics, itemsim, "Toby") # ------------------------------------------------------------------- # del.icio.us examples delicious = DeliciousRec.new delusers = delicious.initializeUserDict("programming") delicious.fillItems(delusers) # pick a user at random user = delusers.keys[rand(delusers.length - 1)] puts "user is #{user}" puts recommendations.topMatches(delusers,user) puts recommendations.getRecommendations(delusers,user)[0,10] url = recommendations.getRecommendations(delusers,user)[0][1] puts "the url is #{url}" puts recommendations.topMatches(recommendations.transformPrefs(delusers),url) # -------------------------------------------------------------------- # Movie Lens examples ##prefs = recommendations.loadMovieLens() #puts prefs["87"] #puts recommendations.getRecommendations(prefs, "87")[0,30] ##itemsim = recommendations.calculateSimilarItems(prefs, n=50) ##puts recommendations.getRecommendedItems(prefs, itemsim, "87")[0,30] end end app = App.new app.run