// Patch summary: unified diff that modifies lib/feature.go and adds lib/feature_test.go.
//
// lib/feature.go
//   - Adds "regexp" to the import block.
//   - ExtractFeaturesWithOptions: the check on opts.ScreenNames changes from
//     `== 0` to `> 0`. When screen names are supplied, the tweet text is now
//     passed through removeScreenNames; the ScreenName / inReplyToScreenName /
//     screenNameInQuotedStatus features move into the else branch (the end of
//     that branch lies outside the hunk shown here).
//   - Adds screenNamePattern: "@" followed by 4 to 50 characters from
//     [a-zA-Z0-9_], then any run of trailing spaces or tabs.
//   - Adds removeScreenNames(s): returns s with every screenNamePattern match
//     replaced by the empty string.
//
// lib/feature_test.go
//   - Adds TestRemoveScreenNames: a map of three input -> expected strings;
//     each mismatch is reported with t.Errorf.
//
// NOTE(review): the pattern has no left-hand boundary, so an "@name" glued to
// preceding text is removed too ("aaa@screen2" -> "aaa", as the test pins).
// Email-like text would be affected the same way -- confirm this is intended.
// NOTE(review): an "@" followed by fewer than 4 matching characters is left in
// the text -- confirm no shorter handles need stripping.
// NOTE(review): this patch appears to have had its line breaks and indentation
// collapsed onto one line; presumably it must be re-split before it can be
// applied -- verify against the original commit.
diff --git a/lib/feature.go b/lib/feature.go index dff984a..e5171e5 100644 --- a/lib/feature.go +++ b/lib/feature.go @@ -2,6 +2,7 @@ package sabadisambiguator import ( "net/url" + "regexp" "strconv" "strings" @@ -141,7 +142,9 @@ func ExtractFeaturesWithOptions(t *twitter2.Tweet, opts ExtractOptions) FeatureV text := t.Text fv = append(fv, "BIAS") - if len(opts.ScreenNames) == 0 { + if len(opts.ScreenNames) > 0 { + text = removeScreenNames(text) + } else { fv = append(fv, "ScreenName:"+t.User.UserName) fv = append(fv, "inReplyToScreenName:"+inReplyToScreenName(t)) fv = append(fv, "screenNameInQuotedStatus:"+screenNameInQuotedStatus(t)) @@ -159,3 +162,10 @@ func ExtractFeaturesWithOptions(t *twitter2.Tweet, opts ExtractOptions) FeatureV fv = append(fv, wordsInUrlPaths(t)...) return fv } + +// https://help.twitter.com/managing-your-account/x-username-rules +var screenNamePattern = regexp.MustCompile("@[a-zA-Z0-9_]{4,50}[ \t]*") + +func removeScreenNames(s string) string { + return screenNamePattern.ReplaceAllLiteralString(s, "") +} diff --git a/lib/feature_test.go b/lib/feature_test.go new file mode 100644 index 0000000..de590fc --- /dev/null +++ b/lib/feature_test.go @@ -0,0 +1,19 @@ +package sabadisambiguator + +import ( + "testing" +) + +func TestRemoveScreenNames(t *testing.T) { + tests := map[string]string{ + "@screen text": "text", + "@screen aaa@screen2": "aaa", + ".@screen": ".", + } + for in, want := range tests { + s := removeScreenNames(in) + if s != want { + t.Errorf("removeScreenNames(%q) = %q; want %q", in, s, want) + } + } +}