diff --git a/.gitignore b/.gitignore index 0e054553..1e58a4fd 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ examples_files/ **/examples.rst _build/ v2.1/docs/engine_files/ +build build/ Pipfile Pipfile.lock diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/content.rst b/v2.1/docs/reference_manual/operators/String operators/String distances/content.rst new file mode 100644 index 00000000..fe88619b --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/content.rst @@ -0,0 +1,66 @@ +------ +Syntax +------ + + **levenshtein (** op1 , op2 **)** + +---------------- +Input parameters +---------------- +.. list-table:: + + * - op1, op2 + - the operands + + +------------------------------------ +Examples of valid syntaxes +------------------------------------ +.. code-block:: + + levenshtein(DS_1, DS_2) + levenshtein("foo", "bar") + +------------------------------------ +Semantics for scalar operations +------------------------------------ +Levenshtein distance is a string metric for measuring the difference between two sequences. + +For example: + +| ``levenshtein("foo", "bar")`` gives ``3`` +| ``levenshtein("foo", "")`` gives ``3`` +| ``levenshtein("foo", "foo")`` gives ``0`` +| ``levenshtein("bar", "baz")`` gives ``1`` + +----------------------------- +Input parameters type +----------------------------- +op1, op2 :: + + dataset { measure _+ } + | component + | string + +----------------------------- +Result type +----------------------------- +result :: + + dataset { measure _+ } + | component + | integer + +----------------------------- +Additional Constraints +----------------------------- +Parameters cannot be omitted. + +--------- +Behaviour +--------- + +As for the invocations at Data Set level, the operator has the behaviour of the “Operators applicable on two Scalar Values +or Data Sets or Data Set Components”. 
As for the invocations at Component or Scalar level, the operator has the behaviour +of the “Operators applicable on two Scalar Values or Data Sets or Data Set Components” +(see the section “Typical behaviours of the ML Operators”). diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_1.csv b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_1.csv new file mode 100644 index 00000000..e3f96916 --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_1.csv @@ -0,0 +1,5 @@ +Id_1,Id_2,Me_1,Me_2 +1,A,"hello world","hello" +2,A,"say hello","hello" +3,A,"he","hello" +4,A,"hello!","hello" \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_1.json b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_1.json new file mode 100644 index 00000000..cde7895b --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_1.json @@ -0,0 +1,24 @@ +{ + "name": "DS_1", + "components": [ + { + "name": "Id_1", + "role": "Identifier", + "data_type": "Integer" + }, + { + "name": "Id_2", + "role": "Identifier", + "data_type": "String" + }, + { + "name": "Me_1", + "role": "Measure", + "data_type": "String" + }, + { + "name": "Me_2", + "role": "Measure", + "data_type": "String" + } + ]} \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_2.csv b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_2.csv new file mode 100644 index 00000000..19b5ca72 --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_2.csv @@ -0,0 +1,5 @@ +Id_1,Id_2,Me_1,Me_2 +1,A,"hi world","hello" +2,A,"say hi","hello" +3,A,"he","hello" +4,A,"hi!","hello" \ No newline at end of file diff --git 
a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_2.json b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_2.json new file mode 100644 index 00000000..cde7895b --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ds_2.json @@ -0,0 +1,24 @@ +{ + "name": "DS_2", + "components": [ + { + "name": "Id_1", + "role": "Identifier", + "data_type": "Integer" + }, + { + "name": "Id_2", + "role": "Identifier", + "data_type": "String" + }, + { + "name": "Me_1", + "role": "Measure", + "data_type": "String" + }, + { + "name": "Me_2", + "role": "Measure", + "data_type": "String" + } + ]} \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.csv b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.csv new file mode 100644 index 00000000..cd106e28 --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.csv @@ -0,0 +1,5 @@ +Id_1,Id_2,Me_1,Me_2 +1,A,4,0 +2,A,4,0 +3,A,0,0 +4,A,4,0 \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.json b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.json new file mode 100644 index 00000000..76390a6f --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.json @@ -0,0 +1,24 @@ +{ + "name": "DS_r", + "components": [ + { + "name": "Id_1", + "role": "Identifier", + "data_type": "Integer" + }, + { + "name": "Id_2", + "role": "Identifier", + "data_type": "String" + }, + { + "name": "Me_1", + "role": "Measure", + "data_type": "Integer" + }, + { + "name": "Me_2", + "role": "Measure", + "data_type": "Integer" + } + ]} \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String 
distances/examples/ex_1.vtl b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.vtl new file mode 100644 index 00000000..8e0078d8 --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_1.vtl @@ -0,0 +1 @@ +DS_r := levenshtein(DS_1, DS_2); \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.csv b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.csv new file mode 100644 index 00000000..8ae0105c --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.csv @@ -0,0 +1,5 @@ +Id_1,Id_2,Me_1,Me_2,delta +1,A,"hello world","hello",6 +2,A,"say hello","hello",4 +3,A,"he","hello",3 +4,A,"hello!","hello",1 \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.json b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.json new file mode 100644 index 00000000..2cd58417 --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.json @@ -0,0 +1,29 @@ +{ + "name": "DS_r", + "components": [ + { + "name": "Id_1", + "role": "Identifier", + "data_type": "Integer" + }, + { + "name": "Id_2", + "role": "Identifier", + "data_type": "String" + }, + { + "name": "Me_1", + "role": "Measure", + "data_type": "String" + }, + { + "name": "Me_2", + "role": "Measure", + "data_type": "String" + }, + { + "name": "delta", + "role": "Measure", + "data_type": "Integer" + } + ]} \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.vtl b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/ex_2.vtl new file mode 100644 index 00000000..c21ffbc1 --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String 
distances/examples/ex_2.vtl @@ -0,0 +1 @@ +DS_r := DS_1 [calc delta := levenshtein (Me_1, Me_2)]; \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/examples/intro.rst b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/intro.rst new file mode 100644 index 00000000..afbd4863 --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/examples/intro.rst @@ -0,0 +1 @@ +Given the operand datasets DS_1 and DS_2: \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/String distances/index.rst b/v2.1/docs/reference_manual/operators/String operators/String distances/index.rst new file mode 100644 index 00000000..6576f23b --- /dev/null +++ b/v2.1/docs/reference_manual/operators/String operators/String distances/index.rst @@ -0,0 +1,5 @@ +============================================================================================= +String distances: `levenshtein` +============================================================================================= +.. include:: ./content.rst +.. 
include:: ./examples.rst \ No newline at end of file diff --git a/v2.1/docs/reference_manual/operators/String operators/index.rst b/v2.1/docs/reference_manual/operators/String operators/index.rst index 62f06219..bfd546a4 100644 --- a/v2.1/docs/reference_manual/operators/String operators/index.rst +++ b/v2.1/docs/reference_manual/operators/String operators/index.rst @@ -12,3 +12,4 @@ VTL-ML - String Operators String pattern replacement/index String pattern location/index String length/index + String distances/index diff --git a/v2.1/src/main/antlr4/org/sdmx/vtl/Vtl.g4 b/v2.1/src/main/antlr4/org/sdmx/vtl/Vtl.g4 index 960e5e98..76bfdb20 100644 --- a/v2.1/src/main/antlr4/org/sdmx/vtl/Vtl.g4 +++ b/v2.1/src/main/antlr4/org/sdmx/vtl/Vtl.g4 @@ -173,6 +173,7 @@ stringOperators: | SUBSTR LPAREN expr (((COMMA startParameter=optionalExpr) (COMMA endParameter=optionalExpr))? | COMMA startParameter=optionalExpr ) RPAREN # substrAtom | REPLACE LPAREN expr COMMA param=expr ( COMMA optionalExpr)? RPAREN # replaceAtom | INSTR LPAREN expr COMMA pattern=expr ( COMMA startParameter=optionalExpr)? (COMMA occurrenceParameter=optionalExpr)? RPAREN # instrAtom + | LEVENSHTEIN LPAREN left=expr COMMA right=expr RPAREN # levenshteinAtom ; stringOperatorsComponent: @@ -180,8 +181,10 @@ stringOperatorsComponent: | SUBSTR LPAREN exprComponent (((COMMA startParameter=optionalExprComponent) (COMMA endParameter=optionalExprComponent))? | COMMA startParameter=optionalExprComponent ) RPAREN # substrAtomComponent | REPLACE LPAREN exprComponent COMMA param=exprComponent ( COMMA optionalExprComponent)? RPAREN # replaceAtomComponent | INSTR LPAREN exprComponent COMMA pattern=exprComponent ( COMMA startParameter=optionalExprComponent)? (COMMA occurrenceParameter=optionalExprComponent)? 
RPAREN # instrAtomComponent + | LEVENSHTEIN LPAREN leftComponent=exprComponent COMMA rightComponent=exprComponent RPAREN # levenshteinAtomComponent ; + numericOperators: op=(CEIL | FLOOR | ABS | EXP | LN | SQRT) LPAREN expr RPAREN # unaryNumeric | op=(ROUND | TRUNC) LPAREN expr (COMMA optionalExpr)? RPAREN # unaryWithOptionalNumeric diff --git a/v2.1/src/main/antlr4/org/sdmx/vtl/VtlTokens.g4 b/v2.1/src/main/antlr4/org/sdmx/vtl/VtlTokens.g4 index 062aefd2..068ea56f 100644 --- a/v2.1/src/main/antlr4/org/sdmx/vtl/VtlTokens.g4 +++ b/v2.1/src/main/antlr4/org/sdmx/vtl/VtlTokens.g4 @@ -140,6 +140,7 @@ lexer grammar VtlTokens; RTRIM : 'rtrim'; INSTR : 'instr'; REPLACE : 'replace'; + LEVENSHTEIN : 'levenshtein'; CEIL : 'ceil'; FLOOR : 'floor'; SQRT : 'sqrt'; diff --git a/v2.1/src/test/resources/NegativeTests.vtl b/v2.1/src/test/resources/NegativeTests.vtl index b2aca9c4..8f02e569 100644 --- a/v2.1/src/test/resources/NegativeTests.vtl +++ b/v2.1/src/test/resources/NegativeTests.vtl @@ -273,6 +273,12 @@ length() //more than one operands used length(DS_1, "hi") +//no operands used +levenshtein() + +//second operand missing +levenshtein(DS_1) + //second operand missing DS_r := DS_1 + diff --git a/v2.1/src/test/resources/PositiveTests.vtl b/v2.1/src/test/resources/PositiveTests.vtl index f4ac087c..8204cd4d 100644 --- a/v2.1/src/test/resources/PositiveTests.vtl +++ b/v2.1/src/test/resources/PositiveTests.vtl @@ -246,6 +246,10 @@ DS_r := DS_2 [calc Me_10:= length(Me_1), Me_20:=length(Me_2)]; DS_r := length(DS_2); +DS_r := levenshtein(DS_1, "test"); + +DS_r := levenshtein(DS_1, DS_2); + DS_r := + DS_1; DS_r := DS_1 [calc Me_3 := + Me_1 ];