-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimilarity_functions.py
69 lines (48 loc) · 2.21 KB
/
similarity_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# NOTE: This code is adapted from https://github.com/Slowika/Bag-of-Recipes
import numpy as np
def jaccard(X):
    """Compute the pairwise Jaccard similarity between ingredients.

    For every pair of columns (ingredients) the similarity is the number of
    recipes containing both, divided by the number of recipes containing at
    least one of them.

    Inputs:
        X: dataframe (or 2-D array) of recipes x ingredients. The counts below
           are only meaningful as set sizes if X is a 0/1 indicator matrix
           (assumed, not checked).

    Returns:
        Square numpy array (ingredients x ingredients) of Jaccard
        similarities. Pairs whose union is empty (both ingredients absent
        from every recipe) get a similarity of 0 instead of NaN.
    """
    recipes = np.asarray(X, dtype=float)

    # Intersection of each ingredient pair (co-occurrence counts).
    inters = recipes.T.dot(recipes)

    # The diagonal holds each ingredient's own recipe count, so
    # |A or B| = |A| + |B| - |A and B|.
    counts = np.diag(inters)
    union = np.add.outer(counts, counts) - inters

    empty_union = union == 0
    if np.any(empty_union):
        print("Some ingredients were not present in any recipe.")

    # Divide only where the union is non-empty; the remaining entries keep
    # the 0 they were initialised with, avoiding 0/0 -> NaN.
    jacs = np.zeros_like(inters)
    np.divide(inters, union, out=jacs, where=~empty_union)
    return jacs
def asymmetric_cosine(X, alpha=0.2):
    """Compute the pairwise asymmetric cosine similarity between ingredients.

    Entry (i, j) of the result is

        |U(i) and U(j)| / (|U(i)|**alpha * |U(j)|**(1 - alpha))

    where U(k) is the set of recipes containing ingredient k. With
    alpha = 0.5 this reduces to the ordinary (symmetric) cosine similarity.

    Inputs:
        X: dataframe (or 2-D array) of recipes x ingredients. The counts below
           are only meaningful as set sizes if X is a 0/1 indicator matrix
           (assumed, not checked).
        alpha: tuning parameter, different weights to set sizes for the
               elements in the pair (presumably intended to lie in [0, 1]).

    Returns:
        Square numpy array (ingredients x ingredients) of similarities.
        Entries whose denominator is zero (an ingredient absent from every
        recipe) are 0 instead of NaN.
    """
    recipes = np.asarray(X, dtype=float)

    # Intersection of each ingredient pair (co-occurrence counts); the
    # diagonal holds each ingredient's own recipe count.
    inters = recipes.T.dot(recipes)
    counts = np.diag(inters)

    # Denominator |U(i)|^alpha * |U(j)|^(1 - alpha) for every ordered pair.
    denom = np.outer(counts ** alpha, counts ** (1 - alpha))

    zero_denom = denom == 0
    if np.any(zero_denom):
        print("Some ingredients were not present in any recipe.")

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0 they were initialised with, avoiding 0/0 -> NaN.
    asims = np.zeros_like(inters)
    np.divide(inters, denom, out=asims, where=~zero_denom)
    return asims
def pmi(X):
    """Calculate the mutual-information ratio between all columns of a binary matrix.

    Entry (i, j) of the result is p(i, j) / (p(i) * p(j)), where probabilities
    are relative frequencies over the recipes (rows). Note that no logarithm
    is applied: a value of 1 means the two ingredients co-occur exactly as
    often as independence would predict.

    Inputs:
        X: dataframe (or 2-D array) of recipes x ingredients, expected to be
           a 0/1 indicator matrix (assumed, not checked).

    Returns:
        Square numpy array (ingredients x ingredients). The diagonal
        (self-PMI) is set to 0, and so are entries involving an ingredient
        that appears in no recipe (where the ratio would otherwise be 0/0).
    """
    recipes = np.asarray(X, dtype=float)
    n_recipes = recipes.shape[0]
    n_ingredients = recipes.shape[1]

    # Without any recipes every probability is undefined; report "no
    # association" rather than dividing by zero.
    if n_recipes == 0:
        return np.zeros((n_ingredients, n_ingredients))

    # Joint probabilities; the diagonal holds the marginals p(i).
    cooc = recipes.T.dot(recipes) / n_recipes
    marginals = np.diag(cooc)
    expected = np.outer(marginals, marginals)

    # Divide only where the expected co-occurrence is non-zero; the remaining
    # entries keep the 0 they were initialised with, avoiding 0/0 -> NaN.
    ratios = np.zeros_like(cooc)
    np.divide(cooc, expected, out=ratios, where=expected != 0)

    # Set self-PMI to zero. fill_diagonal touches only the diagonal, whereas
    # indexing with a list of ranges is read by current NumPy as a row
    # selection and would zero the whole matrix.
    np.fill_diagonal(ratios, 0)
    return ratios