-
Notifications
You must be signed in to change notification settings - Fork 1
/
tfidf_cosine_similarity.go
74 lines (62 loc) · 1.52 KB
/
tfidf_cosine_similarity.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package lhdiff
import (
v "github.com/rexsimiloluwah/distance_metrics/vector"
"math"
"strings"
)
// TfIdfCosineSimilarity returns the cosine similarity between the weighted
// term vectors of docA and docB.
//
// Both documents are split on whitespace with strings.Fields. The merged token
// list returned by union defines the vector dimensions, and each component is
// the weight computed by tfidf from the token's occurrences in the document,
// its total occurrences across both documents, and the length of the merged
// token list.
//
// Two documents without any tokens are considered identical (1). If exactly
// one document has no tokens the result is 0: its vector would be all zeros,
// for which cosine similarity is undefined.
//
// NOTE(review): a token whose total occurrence count equals the length of the
// merged token list gets weight 0 (ln 1). Two identical documents made of two
// distinct words therefore produce all-zero vectors, and the result then
// depends on how v.CosineSimilarity treats zero vectors — confirm.
func TfIdfCosineSimilarity(docA string, docB string) float64 {
	tokensA := strings.Fields(docA)
	tokensB := strings.Fields(docB)

	switch {
	case len(tokensA) == 0 && len(tokensB) == 0:
		return 1
	case len(tokensA) == 0 || len(tokensB) == 0:
		// Nothing can be shared with an empty document; avoid handing a
		// zero vector to the cosine computation.
		return 0
	}

	tokens := union(tokensA, tokensB)
	n := len(tokens)

	// Total number of occurrences of every token across both documents.
	documentFrequency := make(map[string]int, n)
	for _, token := range tokensA {
		documentFrequency[token]++
	}
	for _, token := range tokensB {
		documentFrequency[token]++
	}

	vectorA := make([]float64, n)
	vectorB := make([]float64, n)
	for k, token := range tokens {
		vectorA[k] = tfidf(token, tokensA, n, documentFrequency)
		vectorB[k] = tfidf(token, tokensB, n, documentFrequency)
	}

	return v.CosineSimilarity(vectorA, vectorB)
}
// union returns the distinct strings of a followed by the distinct strings of
// b that do not occur in a, each in first-seen order.
//
// The result is always a freshly allocated slice, so neither argument (nor its
// backing array) is modified.
func union(a, b []string) []string {
	seen := make(map[string]struct{}, len(a)+len(b))
	merged := make([]string, 0, len(a)+len(b))
	for _, group := range [][]string{a, b} {
		for _, item := range group {
			if _, dup := seen[item]; dup {
				continue
			}
			// Record the item so later repeats, from either slice, are skipped.
			seen[item] = struct{}{}
			merged = append(merged, item)
		}
	}
	return merged
}
// tfidf returns the weight of token within tokens.
//
// The weight is (occurrences of token in tokens / documentFrequency[token]) *
// ln(n / documentFrequency[token]). In this file the caller passes the length
// of the merged token list as n and the total occurrence count of each token
// across both documents as documentFrequency.
//
// A token with no recorded frequency, or a non-positive n, yields 0 instead of
// the NaN/Inf that dividing by zero or taking the logarithm of zero would give.
func tfidf(token string, tokens []string, n int, documentFrequency map[string]int) float64 {
	df := documentFrequency[token]
	if df <= 0 || n <= 0 {
		return 0
	}

	total := float64(df)
	tf := float64(count(token, tokens)) / total
	idf := math.Log(float64(n) / total)
	return tf * idf
}
// count reports how many elements of a are equal to key.
func count(key string, a []string) int {
	var matches int
	for i := 0; i < len(a); i++ {
		if a[i] != key {
			continue
		}
		matches++
	}
	return matches
}