1 User-User Collaborative Filtering

1.1 Data

library(tidyverse)
# Toy utility matrix: four users (rows) by seven titles (columns).
# NA marks a title the user has not rated.
user <- c("A", "B", "C", "D")

HP1 <- c(4, 5, NA, NA)
HP2 <- c(NA, 5, NA, 3)
HP3 <- c(NA, 4, NA, NA)
TW <- c(5, NA, 2, NA)
SW1 <- c(1, NA, 4, NA)
SW2 <- c(NA, NA, 5, NA)
SW3 <- c(NA, NA, NA, 3)

# Label the rows with the user ids while the frame is being built.
df1 <- data.frame(HP1, HP2, HP3, TW, SW1, SW2, SW3, row.names = user)
df1
  HP1 HP2 HP3 TW SW1 SW2 SW3
A   4  NA  NA  5   1  NA  NA
B   5   5   4 NA  NA  NA  NA
C  NA  NA  NA  2   4   5  NA
D  NA   3  NA NA  NA  NA   3

1.2 Option 1: Jaccard similarity (1 - Jaccard distance)

Source: https://www.geeksforgeeks.org/how-to-calculate-jaccard-similarity-in-r/#:~:text=Jaccard%20Distance%20is%20a%20measure,where%20J%20is%20Jaccard%20Similarity.

# Treat every missing rating as 0 from here on (overwrites df1 in place).
df1 <- replace(df1, is.na(df1), 0)
df1
  HP1 HP2 HP3 TW SW1 SW2 SW3
A   4   0   0  5   1   0   0
B   5   5   4  0   0   0   0
C   0   0   0  2   4   5   0
D   0   3   0  0   0   0   3
# proxy::dist() gives the pairwise Jaccard distance between the rows (users)
# of df1; subtracting it from 1 turns the distance into a similarity.
jaccard_distance <- proxy::dist(df1, method = "Jaccard")
1 - jaccard_distance
     A    B    C
B 0.20          
C 0.50 0.00     
D 0.00 0.25 0.00

1.3 Option 2: Cosine similarity

# Build the items x users matrix that is passed to cosine() below: cosine()
# compares columns, so df1 (users in rows) is transposed to put one user in
# each column.
# Transposing the whole frame works for any number of users and avoids
# creating one hand-indexed vector per user, one of which was named `c` and
# shadowed base::c().
data2 <- t(as.matrix(df1))
# Drop the item row labels and use lower-case user labels (a, b, c, d) so the
# matrix prints exactly as the per-user cbind() version did.
dimnames(data2) <- list(NULL, tolower(rownames(df1)))
data2
     a b c d
[1,] 4 5 0 0
[2,] 0 5 0 3
[3,] 0 4 0 0
[4,] 5 0 2 0
[5,] 1 0 4 0
[6,] 0 0 5 0
[7,] 0 0 0 3
#calculate Cosine Similarity
library(lsa)
Loading required package: SnowballC
# Pairwise cosine similarity between the user columns of data2
# (unrated items were filled with 0 earlier).
lsa::cosine(data2)
          a         b         c         d
a 1.0000000 0.3798686 0.3220306 0.0000000
b 0.3798686 1.0000000 0.0000000 0.4351941
c 0.3220306 0.0000000 1.0000000 0.0000000
d 0.0000000 0.4351941 0.0000000 1.0000000

1.4 Option 3: Centered cosine similarity

# Centred cosine, step 1: mean-centre each user's ratings.
# The raw ratings are kept in one named list (names become the df3 columns)
# instead of separate a/b/c/d vectors, so nothing is assigned to `c`
# (which would shadow base::c()) and the centring rule is written once.
user_ratings <- list(
  aa = c(4, NA, NA, 5, 1, NA, NA),
  bb = c(5, 5, 4, NA, NA, NA, NA),
  cc = c(NA, NA, NA, 2, 4, 5, NA),
  dd = c(NA, 3, NA, NA, NA, NA, 3)
)

# Subtract each user's own mean, computed over the rated items only.
centered_ratings <- lapply(
  user_ratings,
  function(ratings) ratings - mean(ratings, na.rm = TRUE)
)

# Unrated items become 0 after centring, i.e. "average" for that user.
# User dd gave the same rating twice, so the whole dd column is 0.
df3 <- as.data.frame(centered_ratings)
df3[is.na(df3)] <- 0
df3
          aa         bb         cc dd
1  0.6666667  0.3333333  0.0000000  0
2  0.0000000  0.3333333  0.0000000  0
3  0.0000000 -0.6666667  0.0000000  0
4  1.6666667  0.0000000 -1.6666667  0
5 -2.3333333  0.0000000  0.3333333  0
6  0.0000000  0.0000000  1.3333333  0
7  0.0000000  0.0000000  0.0000000  0
# Centred cosine, step 2: cosine similarity between the mean-centred user
# columns. The all-zero dd column has zero norm, hence the NaN entries.
lsa::cosine(as.matrix(df3))
            aa         bb         cc  dd
aa  1.00000000 0.09245003 -0.5590852 NaN
bb  0.09245003 1.00000000  0.0000000 NaN
cc -0.55908525 0.00000000  1.0000000 NaN
dd         NaN        NaN        NaN   1
# Pearson correlation between the zero-filled, mean-centred user columns of
# df3, shown next to the centred cosine for comparison.
# The constant (all-zero) dd column has zero standard deviation, which is what
# triggers the warning and the NA entries.
# NOTE(review): the call is kept verbatim because R echoes the call text in
# that warning message.
cor(as.matrix(df3))
Warning in cor(as.matrix(df3)): the standard deviation is zero
            aa           bb            cc dd
aa  1.00000000 9.245003e-02 -5.590852e-01 NA
bb  0.09245003 1.000000e+00  3.195947e-32 NA
cc -0.55908525 3.195947e-32  1.000000e+00 NA
dd          NA           NA            NA  1

Recommendation:

Option 1: Average rating

Take the average rating of item \(i\) over the neighbourhood of the \(k\) most similar users.

The predicted rating of product \(i\) for user \(x\) is

\[r_{xi} = \frac{1}{k}\sum_{j=1}^{k}r_{ji}\]

Option 2: Weighted Average Rating

\[r_{xi} = \frac{\sum_{j=1}^{k}s_{xj}r_{ji}}{\sum_{j=1}^{k}s_{xj}}\]

2 Item-item collaborative filtering

# Raw ratings: one vector per movie (m1..m6), one entry per user (12 users).
# NA marks a user who has not rated that movie.
m1 <- c(1, NA, 3, NA, NA, 5, NA, NA, 5, NA, 4, NA)
m2 <- c(NA, NA, 5, 4, NA, NA, 4, NA, NA, 2, 1, 3)
m3 <- c(2, 4, NA, 1, 2, NA, 3, NA, 4, 3, 5, NA)
m4 <- c(NA, 2, 4, NA, 5, NA, NA, 4, NA, NA, 2, NA)
m5 <- c(NA, 4, 3, 4, 2, NA, NA, NA, NA, NA, 2, 5)
m6 <- c(1, NA, 3, NA, 3, NA, NA, 2, NA, NA, 4, NA)

# Subtract a movie's mean rating (over the users who rated it) from each of
# its ratings; unrated entries stay NA.
center_ratings <- function(ratings) {
  ratings - mean(ratings, na.rm = TRUE)
}

m1c <- center_ratings(m1)
m2c <- center_ratings(m2)
m3c <- center_ratings(m3)
m4c <- center_ratings(m4)
m5c <- center_ratings(m5)
m6c <- center_ratings(m6)

# Users in rows, mean-centred movies in columns.
iidf <- data.frame(m1c, m2c, m3c, m4c, m5c, m6c)
iidf
    m1c        m2c m3c  m4c        m5c  m6c
1  -2.6         NA  -1   NA         NA -1.6
2    NA         NA   1 -1.4  0.6666667   NA
3  -0.6  1.8333333  NA  0.6 -0.3333333  0.4
4    NA  0.8333333  -2   NA  0.6666667   NA
5    NA         NA  -1  1.6 -1.3333333  0.4
6   1.4         NA  NA   NA         NA   NA
7    NA  0.8333333   0   NA         NA   NA
8    NA         NA  NA  0.6         NA -0.6
9   1.4         NA   1   NA         NA   NA
10   NA -1.1666667   0   NA         NA   NA
11  0.4 -2.1666667   2 -1.4 -1.3333333  1.4
12   NA -0.1666667  NA   NA  1.6666667   NA
# After centring, fill the unrated entries with 0 (overwrites iidf in place).
iidf <- replace(iidf, is.na(iidf), 0)
iidf
    m1c        m2c m3c  m4c        m5c  m6c
1  -2.6  0.0000000  -1  0.0  0.0000000 -1.6
2   0.0  0.0000000   1 -1.4  0.6666667  0.0
3  -0.6  1.8333333   0  0.6 -0.3333333  0.4
4   0.0  0.8333333  -2  0.0  0.6666667  0.0
5   0.0  0.0000000  -1  1.6 -1.3333333  0.4
6   1.4  0.0000000   0  0.0  0.0000000  0.0
7   0.0  0.8333333   0  0.0  0.0000000  0.0
8   0.0  0.0000000   0  0.6  0.0000000 -0.6
9   1.4  0.0000000   1  0.0  0.0000000  0.0
10  0.0 -1.1666667   0  0.0  0.0000000  0.0
11  0.4 -2.1666667   2 -1.4 -1.3333333  1.4
12  0.0 -0.1666667   0  0.0  1.6666667  0.0
# Item-item similarity: cosine between the mean-centred, zero-filled movie
# columns.
lsa::cosine(as.matrix(iidf))
            m1c        m2c        m3c        m4c         m5c        m6c
m1c  1.00000000 -0.1785421  0.4140393 -0.1024501 -0.03678062  0.5870395
m2c -0.17854212  1.0000000 -0.5262348  0.4680078  0.28671708 -0.3064398
m3c  0.41403934 -0.5262348  1.0000000 -0.6239807 -0.21320072  0.5063697
m4c -0.10245014  0.4680078 -0.6239807  1.0000000 -0.19266866 -0.2353394
m5c -0.03678062  0.2867171 -0.2132007 -0.1926687  1.00000000 -0.4102418
m6c  0.58703951 -0.3064398  0.5063697 -0.2353394 -0.41024184  1.0000000
# Pearson correlation between the same movie columns, for comparison with the
# centred cosine.
stats::cor(as.matrix(iidf))
            m1c        m2c        m3c        m4c         m5c        m6c
m1c  1.00000000 -0.1785421  0.4140393 -0.1024501 -0.03678062  0.5870395
m2c -0.17854212  1.0000000 -0.5262348  0.4680078  0.28671708 -0.3064398
m3c  0.41403934 -0.5262348  1.0000000 -0.6239807 -0.21320072  0.5063697
m4c -0.10245014  0.4680078 -0.6239807  1.0000000 -0.19266866 -0.2353394
m5c -0.03678062  0.2867171 -0.2132007 -0.1926687  1.00000000 -0.4102418
m6c  0.58703951 -0.3064398  0.5063697 -0.2353394 -0.41024184  1.0000000

Suppose we want to give a prediction to Movie 1 and User 5

We need to find the movies that are most similar to movie 1 and that have also been rated by user 5.

Movie 3 and Movie 6 form the neighbourhood of movie 1.

# Uncentred ratings table (users in rows, movies in columns), used to read off
# the ratings user 5 gave to the neighbouring movies.
iidf2 <- data.frame(
  m1 = m1,
  m2 = m2,
  m3 = m3,
  m4 = m4,
  m5 = m5,
  m6 = m6
)
iidf2
   m1 m2 m3 m4 m5 m6
1   1 NA  2 NA NA  1
2  NA NA  4  2  4 NA
3   3  5 NA  4  3  3
4  NA  4  1 NA  4 NA
5  NA NA  2  5  2  3
6   5 NA NA NA NA NA
7  NA  4  3 NA NA NA
8  NA NA NA  4 NA  2
9   5 NA  4 NA NA NA
10 NA  2  3 NA NA NA
11  4  1  5  2  2  4
12 NA  3 NA NA  5 NA

\(s_{13} = 0.41\) and \(s_{16} = 0.59\).

Rating for movie 1 for user 5 is

\[r_{5,1} = \frac{(0.41 \times 2) + (0.59 \times 3)}{0.41 + 0.59} = 2.59 \approx 2.6\]

For movie 1, the predicted rating of user 5 is 2.6.