# Written by Jun Wang, Jie Yang # see LICENSE.txt for license information import sys import math from random import random def cooccurrence(pref1, pref2): pref1.sort() pref2.sort() i = 0 j = 0 co = 0 size1 = len(pref1) size2 = len(pref2) if size1 == 0 or size2 == 0: return 0 while 1: if (i>= size1) or (j>=size2): break Curr_ID1 = pref1[i] Curr_ID2 = pref2[j] if Curr_ID1 < Curr_ID2 : i=i+1 elif Curr_ID1 > Curr_ID2 : j=j+1 else: co +=1 i+=1 j+=1 return co def cooccurrence2(pref1, pref2): # pref1 is sorted pref2.sort() i = 0 j = 0 co = 0 size1 = len(pref1) size2 = len(pref2) if size1 == 0 or size2 == 0: return 0 while 1: if (i>= size1) or (j>=size2): break Curr_ID1 = pref1[i] Curr_ID2 = pref2[j] if Curr_ID1 < Curr_ID2 : i=i+1 elif Curr_ID1 > Curr_ID2 : j=j+1 else: co +=1 i+=1 j+=1 return co def P2PSim(pref1, pref2): co = cooccurrence(pref1, pref2) if co == 0: return 0 normValue = math.sqrt(len(pref1)*len(pref2)) sim0 = co/normValue sim = int(sim0*1000) # use integer for bencode return sim def P2PSim2(pref1, pref2): # use cooccurrence2 co = cooccurrence2(pref1, pref2) if co == 0: return 0 normValue = math.sqrt(len(pref1)*len(pref2)) sim0 = co/normValue sim = int(sim0*1000) # use integer for bencode return sim def selectByProbability(pdf_vector, num=1, smooth=2, smooth_value=1, inplace=True): """ select a number of candidates based on their probabilities """ # Attention: pdf_vector will be changed after this call # pdf_vector: Discrete vector of the Probability Density Function # num: the number of candidates to be selected # smooth: # 0 - no smooth # 1 - always smooth # 2 - if pdf_vector contains 0, smooth # smooth_value: the extra value added to each item if smooth # return: The index list of selected items if not pdf_vector: return [] if not inplace: pdf_vector = pdf_vector[:] n = len(pdf_vector) candidates = range(n) if num >= n: return candidates if smooth == 1 or (smooth == 2 and min(pdf_vector) == 0): for i in candidates: pdf_vector[i] += smooth_value selected = [] while len(selected) < num: cdf_vector = getCDF(pdf_vector) rand = random() * max(cdf_vector) cand = bisearch(cdf_vector, rand) # bisect selected.append(candidates.pop(cand)) pdf_vector.pop(cand) return selected def getCDF(pdf_vector): cdf_vector = [] sum = 0 for i in xrange(len(pdf_vector)): if pdf_vector[i] > 0: sum += pdf_vector[i] cdf_vector.append(sum) return cdf_vector def bisearch(vector, value): # vector must be sorted low = 0 high = len(vector) while low < high: mid = (low + high) / 2 if value == vector[mid]: return mid elif value > vector[mid]: low = mid + 1 else: high = mid return low if '__main__'== __name__: def testSim(): pref1 = [1,3,6,8,9,0,2,7,5,4] pref2 = [1,2,3,4,5,6,7,8,9,0] pref3 = [1,3,5,7,9, 11, 13] pref4 = [11, 24, 25, 64] pref5 = [] pref6 = [1, 66, 77, 88, 99, 100, 11] #cand = ['111','222','333','444','555','666','777','888','999'] # for j in xrange(55000): # x = selectByProbability(pref1[:], pref1, 1) # for i in x: # print i, # print # print "****" # print pref1 # print bisearch(pref1, 3.1) # print getCDF(pref1) print cooccurrence(pref1, pref2), P2PSim(pref1, pref2) print cooccurrence(pref1, pref3), P2PSim(pref1, pref3) print cooccurrence(pref1, pref4), P2PSim(pref1, pref4) print cooccurrence(pref1, pref5), P2PSim(pref1, pref5) print cooccurrence(pref1, pref6), P2PSim(pref1, pref6) testSim()