library(tidyverse)
# Toy ratings data: one vector per item (HP1 ... SW3), one element per user
# listed in `user`. NA marks an item the user has not rated.
user <- c("A", "B", "C", "D")
HP1 <- c(4, 5, NA, NA)
HP2 <- c(NA, 5, NA, 3)
HP3 <- c(NA, 4, NA, NA)
TW <- c(5, NA, 2, NA)
SW1 <- c(1, NA, 4, NA)
SW2 <- c(NA, NA, 5, NA)
SW3 <- c(NA, NA, NA, 3)

# Assemble the user-by-item table, labelling the rows with the user ids at
# construction time.
df1 <- data.frame(HP1, HP2, HP3, TW, SW1, SW2, SW3, row.names = user)
df1
HP1 HP2 HP3 TW SW1 SW2 SW3
A 4 NA NA 5 1 NA NA
B 5 5 4 NA NA NA NA
C NA NA NA 2 4 5 NA
D NA 3 NA NA NA NA 3
Source: https://www.geeksforgeeks.org/how-to-calculate-jaccard-similarity-in-r/#:~:text=Jaccard%20Distance%20is%20a%20measure,where%20J%20is%20Jaccard%20Similarity.
# Treat "not rated" as 0 so the similarity functions below receive a
# complete numeric table.
df1 <- replace(df1, is.na(df1), 0)
df1
HP1 HP2 HP3 TW SW1 SW2 SW3
A 4 0 0 5 1 0 0
B 5 5 4 0 0 0 0
C 0 0 0 2 4 5 0
D 0 3 0 0 0 0 3
# Jaccard similarity between users (rows of df1), obtained as
# 1 - Jaccard distance. The printed values correspond to comparing which items
# each user rated (non-zero entries), not the rating values themselves.
1 - proxy::dist(x = df1, method = "Jaccard")
A B C
B 0.20
C 0.50 0.00
D 0.00 0.25 0.00
# Flatten each user's row of df1 into a plain, unnamed numeric vector, then
# bind the vectors side by side so that every user becomes a column.
a <- unlist(df1[1, ], use.names = FALSE)
b <- unlist(df1[2, ], use.names = FALSE)
c <- unlist(df1[3, ], use.names = FALSE)
d <- unlist(df1[4, ], use.names = FALSE)

data2 <- cbind(a, b, c, d)
data2
a b c d
[1,] 4 5 0 0
[2,] 0 5 0 3
[3,] 0 4 0 0
[4,] 5 0 2 0
[5,] 1 0 4 0
[6,] 0 0 5 0
[7,] 0 0 0 3
# Calculate cosine similarity between users (the columns of data2)
library(lsa)
Loading required package: SnowballC
# Pairwise cosine similarity between the columns of data2 (one column per
# user), using the zero-filled raw ratings.
lsa::cosine(x = data2)
a b c d
a 1.0000000 0.3798686 0.3220306 0.0000000
b 0.3798686 1.0000000 0.0000000 0.4351941
c 0.3220306 0.0000000 1.0000000 0.0000000
d 0.0000000 0.4351941 0.0000000 1.0000000
# Raw ratings per user (one element per item); NA = not rated.
a <- c(4, NA, NA, 5, 1, NA, NA)
b <- c(5, 5, 4, NA, NA, NA, NA)
c <- c(NA, NA, NA, 2, 4, 5, NA)
d <- c(NA, 3, NA, NA, NA, NA, 3)

# Centre every user's ratings on that user's mean over the items they
# actually rated; unrated items stay NA at this point.
mean_a <- mean(a, na.rm = TRUE)
aa <- a - mean_a
bb <- b - mean(b, na.rm = TRUE)
cc <- c - mean(c, na.rm = TRUE)
dd <- d - mean(d, na.rm = TRUE)

# Collect the centred vectors as columns and replace the remaining NAs
# (unrated items) with 0. User d gave the same rating twice, so dd becomes
# all zeros.
df3 <- data.frame(aa, bb, cc, dd)
df3 <- replace(df3, is.na(df3), 0)
df3
aa bb cc dd
1 0.6666667 0.3333333 0.0000000 0
2 0.0000000 0.3333333 0.0000000 0
3 0.0000000 -0.6666667 0.0000000 0
4 1.6666667 0.0000000 -1.6666667 0
5 -2.3333333 0.0000000 0.3333333 0
6 0.0000000 0.0000000 1.3333333 0
7 0.0000000 0.0000000 0.0000000 0
# Centred cosine similarity between users (columns of df3). The dd column is
# all zeros, so every similarity involving dd is 0/0 and prints as NaN.
lsa::cosine(x = as.matrix(df3))
aa bb cc dd
aa 1.00000000 0.09245003 -0.5590852 NaN
bb 0.09245003 1.00000000 0.0000000 NaN
cc -0.55908525 0.00000000 1.0000000 NaN
dd NaN NaN NaN 1
# Pearson correlation between the columns of the zero-filled, centred table.
# Each column was centred on its observed mean and padded with zeros, so it
# already sums to zero; the correlations for aa, bb and cc therefore agree
# with the centred cosine values (up to floating-point noise). The dd column
# is constant (all zeros), so cor() warns about a zero standard deviation and
# returns NA for every pair involving dd.
cor(as.matrix(df3))
Warning in cor(as.matrix(df3)): the standard deviation is zero
aa bb cc dd
aa 1.00000000 9.245003e-02 -5.590852e-01 NA
bb 0.09245003 1.000000e+00 3.195947e-32 NA
cc -0.55908525 3.195947e-32 1.000000e+00 NA
dd NA NA NA 1
Recommendation:
Option 1: Average rating
Take the average rating of item \(i\) over the neighbourhood of \(k\) users.
The predicted rating of product \(i\) for user \(x\) is
\[r_{xi} = \frac{1}{k}\sum_{j=1}^{k}r_{ji}\]
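Option 2: Weighted average rating
Weight each neighbour's rating by \(s_{xj}\), the similarity between user \(x\) and neighbour \(j\).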
\[r_{xi} = \frac{\sum_{j=1}^{k}s_{xj}r_{ji}}{\sum_{j=1}^{k}s_{xj}}\]
# Raw ratings for six movies (m1 ... m6); each vector holds one entry per
# user (12 users), with NA where the user has not rated the movie.
m1 <- c(1, NA, 3, NA, NA, 5, NA, NA, 5, NA, 4, NA)
m2 <- c(NA, NA, 5, 4, NA, NA, 4, NA, NA, 2, 1, 3)
m3 <- c(2, 4, NA, 1, 2, NA, 3, NA, 4, 3, 5, NA)
m4 <- c(NA, 2, 4, NA, 5, NA, NA, 4, NA, NA, 2, NA)
m5 <- c(NA, 4, 3, 4, 2, NA, NA, NA, NA, NA, 2, 5)
m6 <- c(1, NA, 3, NA, 3, NA, NA, 2, NA, NA, 4, NA)

# Centre each movie's ratings on the mean of its observed ratings; missing
# ratings remain NA.
m1c <- m1 - mean(m1, na.rm = TRUE)
m2c <- m2 - mean(m2, na.rm = TRUE)
m3c <- m3 - mean(m3, na.rm = TRUE)
m4c <- m4 - mean(m4, na.rm = TRUE)
m5c <- m5 - mean(m5, na.rm = TRUE)
m6c <- m6 - mean(m6, na.rm = TRUE)

# Item-item table of centred ratings: rows are users, columns are movies.
iidf <- data.frame(m1c, m2c, m3c, m4c, m5c, m6c)
iidf
m1c m2c m3c m4c m5c m6c
1 -2.6 NA -1 NA NA -1.6
2 NA NA 1 -1.4 0.6666667 NA
3 -0.6 1.8333333 NA 0.6 -0.3333333 0.4
4 NA 0.8333333 -2 NA 0.6666667 NA
5 NA NA -1 1.6 -1.3333333 0.4
6 1.4 NA NA NA NA NA
7 NA 0.8333333 0 NA NA NA
8 NA NA NA 0.6 NA -0.6
9 1.4 NA 1 NA NA NA
10 NA -1.1666667 0 NA NA NA
11 0.4 -2.1666667 2 -1.4 -1.3333333 1.4
12 NA -0.1666667 NA NA 1.6666667 NA
# Unrated cells become 0 in the centred table so the similarity functions
# below can operate on complete columns.
iidf <- replace(iidf, is.na(iidf), 0)
iidf
m1c m2c m3c m4c m5c m6c
1 -2.6 0.0000000 -1 0.0 0.0000000 -1.6
2 0.0 0.0000000 1 -1.4 0.6666667 0.0
3 -0.6 1.8333333 0 0.6 -0.3333333 0.4
4 0.0 0.8333333 -2 0.0 0.6666667 0.0
5 0.0 0.0000000 -1 1.6 -1.3333333 0.4
6 1.4 0.0000000 0 0.0 0.0000000 0.0
7 0.0 0.8333333 0 0.0 0.0000000 0.0
8 0.0 0.0000000 0 0.6 0.0000000 -0.6
9 1.4 0.0000000 1 0.0 0.0000000 0.0
10 0.0 -1.1666667 0 0.0 0.0000000 0.0
11 0.4 -2.1666667 2 -1.4 -1.3333333 1.4
12 0.0 -0.1666667 0 0.0 1.6666667 0.0
# Centred cosine similarity between movies (columns of iidf).
lsa::cosine(x = as.matrix(iidf))
m1c m2c m3c m4c m5c m6c
m1c 1.00000000 -0.1785421 0.4140393 -0.1024501 -0.03678062 0.5870395
m2c -0.17854212 1.0000000 -0.5262348 0.4680078 0.28671708 -0.3064398
m3c 0.41403934 -0.5262348 1.0000000 -0.6239807 -0.21320072 0.5063697
m4c -0.10245014 0.4680078 -0.6239807 1.0000000 -0.19266866 -0.2353394
m5c -0.03678062 0.2867171 -0.2132007 -0.1926687 1.00000000 -0.4102418
m6c 0.58703951 -0.3064398 0.5063697 -0.2353394 -0.41024184 1.0000000
# Pearson correlation between the same zero-filled, centred movie columns;
# the printed matrix matches the centred cosine result.
stats::cor(as.matrix(iidf))
m1c m2c m3c m4c m5c m6c
m1c 1.00000000 -0.1785421 0.4140393 -0.1024501 -0.03678062 0.5870395
m2c -0.17854212 1.0000000 -0.5262348 0.4680078 0.28671708 -0.3064398
m3c 0.41403934 -0.5262348 1.0000000 -0.6239807 -0.21320072 0.5063697
m4c -0.10245014 0.4680078 -0.6239807 1.0000000 -0.19266866 -0.2353394
m5c -0.03678062 0.2867171 -0.2132007 -0.1926687 1.00000000 -0.4102418
m6c 0.58703951 -0.3064398 0.5063697 -0.2353394 -0.41024184 1.0000000
Suppose we want to predict the rating of Movie 1 by User 5.
We need to find the movies most similar to movie 1 that user 5 has also rated.
Movie 3 and Movie 6 form the neighbourhood of movie 1.
# Uncentred ratings table (rows = users, columns = movies), used to look up
# the actual ratings a user gave to the neighbouring movies.
iidf2 <- data.frame(
  m1 = m1,
  m2 = m2,
  m3 = m3,
  m4 = m4,
  m5 = m5,
  m6 = m6
)
iidf2
m1 m2 m3 m4 m5 m6
1 1 NA 2 NA NA 1
2 NA NA 4 2 4 NA
3 3 5 NA 4 3 3
4 NA 4 1 NA 4 NA
5 NA NA 2 5 2 3
6 5 NA NA NA NA NA
7 NA 4 3 NA NA NA
8 NA NA NA 4 NA 2
9 5 NA 4 NA NA NA
10 NA 2 3 NA NA NA
11 4 1 5 2 2 4
12 NA 3 NA NA 5 NA
\(s_{13} = 0.41\) and \(s_{16} = 0.59\).
The predicted rating of movie 1 for user 5 is
\[r_{51} = \frac{(0.41 \times 2) + (0.59 \times 3)}{0.41 + 0.59} = 2.59 \approx 2.6\] For movie 1, the predicted rating for user 5 is 2.6.