Skip to content

Commit

Permalink
Starting working on gersam
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Mar 19, 2020
1 parent a7b072a commit f38f1f6
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ experiments
build
*.log
*.lock
*.pl
.DS_Store
TODO.md

Expand Down
115 changes: 115 additions & 0 deletions src/tokenizers/words/gersam.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions src/tokenizers/words/treebank.js
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,3 @@ export default function tokenize(text) {

return finalize(text);
}

// FRENCH aujourd'hui & weird apostrophe
1 change: 1 addition & 0 deletions test/endpoint.js
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ describe('tokenizers', function() {
});

describe('words', function() {
require('./tokenizers/words/gersam.js');
require('./tokenizers/words/treebank.js');
});
});
38 changes: 38 additions & 0 deletions test/tokenizers/words/gersam.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
* Talisman tokenizers/words/gersam tests
* =======================================
*
*/
import assert from 'assert';
import words from '../../../src/tokenizers/words/gersam';

/**
 * Fixture-driven checks for the gersam word tokenizer.
 *
 * Every fixture pairs a language code and an input text with the exact list
 * of tokens the tokenizer is expected to produce for them.
 */
describe('gersam', function() {
  it('should correctly tokenize words.', function() {
    const fixtures = [

      // Currency amount, line breaks and sentence-final periods.
      {
        lang: 'en',
        text: 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.',
        tokens: [
          'Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
          'Please', 'buy', 'me', 'two', 'of', 'them', '.',
          'Thanks', '.'
        ]
      },

      // Contraction: the clitic is split off with its apostrophe.
      {
        lang: 'en',
        text: 'They\'ll save and invest more.',
        tokens: ['They', '\'ll', 'save', 'and', 'invest', 'more', '.']
      },

      // Negative contraction and a trailing comma.
      {
        lang: 'en',
        text: 'hi, my name can\'t hello,',
        tokens: ['hi', ',', 'my', 'name', 'can', '\'t', 'hello', ',']
      },

      // Quotes, parentheses, and three dots expected as a single ellipsis.
      {
        lang: 'en',
        text: '"Hello", Good sir (this is appaling)...',
        tokens: [
          '"', 'Hello', '"', ',', 'Good', 'sir',
          '(', 'this', 'is', 'appaling', ')', '…'
        ]
      }
    ];

    // Run the fixtures in declaration order; the first mismatch throws.
    for (const fixture of fixtures) {
      const actual = words(fixture.lang, fixture.text);

      assert.deepEqual(actual, fixture.tokens);
    }
  });
});
4 changes: 4 additions & 0 deletions test/tokenizers/words/treebank.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ describe('treebank', function() {
text: 'hi, my name can\'t hello,',
tokens: ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
},
{
text: 'O.N.U.',
tokens: ['O.N.U', '.']
},
{
text: '"Hello", Good sir (this is appaling)...',
tokens: ['``', 'Hello', '\'\'', ',', 'Good', 'sir', '(', 'this', 'is', 'appaling', ')', '...']
Expand Down

0 comments on commit f38f1f6

Please sign in to comment.