-
Notifications
You must be signed in to change notification settings - Fork 7
Open
Description
Saw this project on reddit and thought it was really awesome. Keep up the good work, it's a great idea. Anyway, I saw that you said you were manually rejecting "false" hits, which included identical tweets or transpositions of a few letters, and I thought to myself that this could be an easy fix.
I wrote a simple, abstracted implementation (in JavaScript) that removes some of these "false" hits, mainly because I was too lazy to look into how to implement it in Python or how to integrate it with your project and submit a PR, haha.
Hopefully it helps. Maybe at least it keeps you from having to think of a solution haha. Awesome project!
Carlos
//tweets would have to be stripped of things you exclude mentioned in your readme like @'s
/**
 * Decides whether two tweets are close enough to be treated as a "false" hit.
 *
 * Two tweets are too similar when either
 *   - they share more than TWEET_WORD_SIMILARITY distinct, non-ignored words, or
 *   - their edit distance (as reported by levenshtein()) is below LD_THRESHOLD.
 *
 * Words are the space-separated tokens of each tweet and are compared as whole
 * words, case-sensitively. Empty tokens produced by consecutive spaces are
 * skipped, and a word that occurs several times is counted only once.
 *
 * Relies on two names defined outside this function: IGNORED_WORDS_HASH (an
 * object whose own keys are the words to ignore) and levenshtein(s, t).
 *
 * @param {string} tweet1 - first tweet, already stripped of excluded tokens
 * @param {string} tweet2 - second tweet, already stripped of excluded tokens
 * @returns {boolean} true when the tweets are considered too similar
 */
function tooSimilar(tweet1, tweet2) {
  var LD_THRESHOLD = 4; // an edit distance below this counts as too similar
  var TWEET_WORD_SIMILARITY = 3; // more shared words than this counts as too similar
  var hasOwn = Object.prototype.hasOwnProperty;
  var i;

  // Index tweet2's words for whole-word lookup. The prototype-less object keeps
  // tokens such as "constructor" or "__proto__" from colliding with built-ins.
  var unmatchedWords = Object.create(null);
  var words2 = tweet2.split(' ');
  for (i = 0; i < words2.length; i++) {
    if (words2[i] !== '') {
      unmatchedWords[words2[i]] = true;
    }
  }

  var matchCount = 0;
  var words1 = tweet1.split(' ');
  for (i = 0; i < words1.length; i++) {
    var word = words1[i];
    // Ignore words like pronouns, articles, etc. Only own keys of the hash count,
    // so inherited names such as "toString" are not mistaken for ignored words.
    if (unmatchedWords[word] === true && !hasOwn.call(IGNORED_WORDS_HASH, word)) {
      unmatchedWords[word] = false; // count each shared word once
      matchCount++;
      // The edit distance can be expensive, so bail out as soon as possible.
      if (matchCount > TWEET_WORD_SIMILARITY) {
        return true;
      }
    }
  }

  return levenshtein(tweet1, tweet2) < LD_THRESHOLD;
}
//credit: http://stackoverflow.com/questions/11919065/sort-an-array-by-the-levenshtein-distance-with-best-performance-in-javascript
//info: http://en.wikipedia.org/wiki/Levenshtein_distance
/**
 * Computes the edit distance between two strings, counting insertions,
 * deletions, substitutions and swaps of two adjacent characters as one edit
 * each (e.g. "haet" -> "hate" is 1, not 2).
 *
 * Uses the full (n + 1) x (m + 1) dynamic-programming matrix, so time and
 * memory both grow with s.length * t.length.
 *
 * @param {string} s - first string
 * @param {string} t - second string
 * @returns {number} the number of edits needed to turn s into t
 */
function levenshtein(s, t) {
  var n = s.length;
  var m = t.length;

  // Turning an empty string into the other one costs one insertion per character.
  if (n === 0) return m;
  if (m === 0) return n;

  var d = []; // d[i][j] = distance between the first i chars of s and the first j chars of t
  var i;
  var j;

  // First column: deleting i characters. First row: inserting j characters.
  for (i = 0; i <= n; i++) {
    d[i] = [];
    d[i][0] = i;
  }
  for (j = 0; j <= m; j++) {
    d[0][j] = j;
  }

  for (i = 1; i <= n; i++) {
    var s_i = s.charAt(i - 1);

    for (j = 1; j <= m; j++) {
      var t_j = t.charAt(j - 1);
      var cost = (s_i === t_j) ? 0 : 1;

      var best = Math.min(
        d[i - 1][j] + 1, // deletion
        d[i][j - 1] + 1, // insertion
        d[i - 1][j - 1] + cost // match or substitution
      );

      // Damerau transposition: the last two characters of each prefix are the
      // same pair in swapped order. Can be removed for optimization.
      if (i > 1 && j > 1 && s_i === t.charAt(j - 2) && s.charAt(i - 2) === t_j) {
        best = Math.min(best, d[i - 2][j - 2] + cost);
      }

      d[i][j] = best;
    }
  }

  return d[n][m];
}
Metadata
Metadata
Assignees
Labels
No labels