Parallelize test execution and schema checks using Python multiproces…

…sing (#162) * Adding ICU4C running collation tests - first try * Cache ICU4C binaries in GH and locally, only if they don't exist * Install JSON-C dependency if not installed at beginning of CI or e2e script * Fix bash if condition syntax * CPP Collation much better now * Parallelize validations with schema * Add parallel processing for test generation --------- Co-authored-by: Elango Cheran <[email protected]>
unicode-org · Jan 19, 2024 · e7c959e · e7c959e
1 parent acde9b9
commit e7c959e
Show file tree

Hide file tree

Showing 15 changed files with 544 additions and 143 deletions.
diff --git a/executors/cpp/coll.cpp b/executors/cpp/coll.cpp
@@ -23,6 +23,7 @@
 #include <unicode/locid.h>
 #include <unicode/utypes.h>
 #include <unicode/coll.h>
+#include <unicode/tblcoll.h>
 #include <unicode/ucol.h>
 #include <unicode/unistr.h>
 #include <unicode/ustring.h>
@@ -36,12 +37,14 @@ using std::string;
 using icu::Locale;
 using icu::UnicodeString;
 using icu::Collator;
+using icu::RuleBasedCollator;
+
+const string error_message = "error";
 
 /**
  * test_collator  --  process JSON inputs, run comparator, return result
  */
-const string test_collator(json_object *json_in)  //
-{
+const string test_collator(json_object *json_in) {
   UErrorCode status = U_ZERO_ERROR;
 
   json_object *label_obj = json_object_object_get(json_in, "label");
@@ -52,27 +55,59 @@ const string test_collator(json_object *json_in)  //
 
   string string1 = json_object_get_string(str1);
   string string2 = json_object_get_string(str2);
-  // cout << "s1 = " << string1 << " s2 = " << string2 << endl;
-
-  UnicodeString us1 = UnicodeString(string1.c_str()).unescape();
-  UnicodeString us2 = UnicodeString(string2.c_str()).unescape();
 
-  // Check unescaped versions.
-  char char_out1[1000] = "";
-  char char_out2[1000] = "";
-  int32_t chars_out = us1.extract(char_out1, 1000, nullptr, status);
-  chars_out = us2.extract(char_out2, 1000, nullptr, status);
+  // Does this conversion preserve the data?
+  UnicodeString us1 = UnicodeString::fromUTF8(string1);  //(string1.c_str()).unescape();
+  UnicodeString us2 = UnicodeString::fromUTF8(string2);  // .c_str()).unescape();
 
-  // cout << "us1 = " << char_out1 << " us2 = " << char_out2 << endl;
+  string test_result;
 
   json_object *locale_obj = json_object_object_get(json_in, "locale");
   const char *locale_string;
   if (locale_obj) {
     locale_string = json_object_get_string(locale_obj);
   } else {
-    locale_string = nullptr;
+    locale_string = "und";
   }
 
+  // Comparison type
+  json_object *compare_type_obj = json_object_object_get(json_in, "compare_type");
+  string compare_type_string = "";
+  if (compare_type_obj) {
+    compare_type_string = json_object_get_string(compare_type_obj);
+  }
+
+  // Strength of comparison
+  Collator::ECollationStrength strength_type =  Collator::PRIMARY;
+  string strength_string = "";
+
+  json_object *strength_obj = json_object_object_get(json_in, "strength");
+  if (strength_obj) {
+    strength_string = json_object_get_string(strength_obj);
+    if (strength_string == "primary") {
+      strength_type = Collator::PRIMARY;
+    }
+    else if (strength_string == "secondary") {
+      strength_type = Collator:: SECONDARY;
+    }
+    else if (strength_string == "tertiary") {
+      strength_type = Collator::TERTIARY;
+    }
+    else if (strength_string == "quaternary") {
+      strength_type = Collator::QUATERNARY;
+    }
+    else if (strength_string == "IDENTICAL") {
+      strength_type = Collator::IDENTICAL;
+    }
+  }
+
+  // Check for rule-based collation
+  json_object *rules_obj = json_object_object_get(json_in, "rules");
+  string rules_string = "";
+  if (rules_obj) {
+    rules_string = json_object_get_string(rules_obj);
+  }
+  UnicodeString uni_rules = UnicodeString::fromUTF8(rules_string);
 
   // Allow for different levels or types of comparison.
   json_object *compare_type = json_object_object_get(json_in, "compare_type");
@@ -86,36 +121,154 @@ const string test_collator(json_object *json_in)  //
   const int32_t unspecified_length = -1;
   bool coll_result = true;
 
+  // The json test output.
+  json_object *return_json = json_object_new_object();
+  json_object_object_add(return_json, "label", label_obj);
+
+  bool no_error = true;
+  int uni_result;
   // Create a C++ collator and try it.
-  Collator *uni_coll = Collator::createInstance(Locale(locale_string), status);
-  if (ignore_obj) {
-    uni_coll->setAttribute(UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, status);
+
+  Collator *uni_coll = nullptr;
+  RuleBasedCollator *rb_coll = nullptr;
+
+  if (rules_string != "") {
+    char uni_rules_out[1000] = "";
+    int32_t rule_chars_out = uni_rules.extract(uni_rules_out, 1000, nullptr, status);
+    cout << "# RULES string: " << rules_string << endl;
+    // !!! rb_coll = new RuleBasedCollator(uni_rules, strength_type, status);
+    rb_coll = new RuleBasedCollator(uni_rules, status);
+    if (U_FAILURE(status)) {
+      test_result = error_message.c_str();
+      // TODO: report the error in creating the instance
+      cout << "# Error in making RuleBasedCollator: " << label_string << " : " << test_result << endl;
+
+      json_object_object_add(return_json,
+                           "error", json_object_new_string("creat rule based collator"));
+      no_error = false;
+    }
+
+    // cout << "# Calling rb_coll compare" << endl;
+    uni_result = rb_coll->compare(us1, us2, status);
+    if (U_FAILURE(status)) {
+      test_result = error_message.c_str();
+
+      json_object_object_add(return_json,
+                           "error", json_object_new_string("error in rb_coll->compare"));
+      no_error = false;
+      cout << "# Error in rb_coll->compare: " << label_string << " : " << test_result << endl;
+    }
+    // Don't need this anymore.
+    delete rb_coll;
   }
+  else {
+    // Not a rule-based collator.
+    if (locale_string == "") {
+      uni_coll = Collator::createInstance(status);
+    } else {
+      cout << "# Locale set to " << locale_string <<  endl;
+      uni_coll = Collator::createInstance(Locale(locale_string), status);
+    }
 
-  int uni_result = uni_coll->compare(us1, us2);
-  // cout << "UNI_RESULT = " << uni_result << endl;
+    if (U_FAILURE(status)) {
+      test_result = error_message.c_str();
+      json_object_object_add(return_json,
+                           "error", json_object_new_string("error creating collator instance"));
+      no_error = false;
+      cout << "# Error in createInstance: " << label_string << " : " << test_result << endl;
+    }
 
-  // The json test output.
-  json_object *return_json = json_object_new_object();
-  json_object_object_add(return_json, "label", label_obj);
+    if (strength_obj) {
+      // !!!
+      cout << "#   Collator strength = " << strength_string << endl;
+      uni_coll->setStrength(strength_type);
+    }
 
-  int64_t numeric_result = int64_t(uni_result);
-  if (uni_result == UCOL_GREATER) {
-    coll_result = false;
+    if (ignore_obj) {
+      const bool ignore_punctuation_bool = json_object_get_boolean(ignore_obj);
+      if (ignore_punctuation_bool) {
+        uni_coll->setAttribute(UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, status);
+        if (U_FAILURE(status)) {
+          test_result = error_message.c_str();
+          json_object_object_add(return_json,
+                                 "error", json_object_new_string("error setAttribute"));
+          no_error = false;
+          cout << "# Error in setAttribute: " << label_string << " : " << test_result << endl;
+        }
+      }
+    }
 
-    // Include data compared in the failing test
-    json_object_object_add(
-        return_json, "s1", json_object_new_string(string1.c_str()));
-    json_object_object_add(
-        return_json, "s2", json_object_new_string(string2.c_str()));
+    // Just to check the result.
+    UColAttributeValue alternate_value = uni_coll->getAttribute(UCOL_ALTERNATE_HANDLING, status);
 
-    // What was the actual returned value?
-    json_object_object_add(
-        return_json, "compare", json_object_new_int64(numeric_result));
+    // Try two different APIs
+    int uni_result_utf8 = uni_coll->compareUTF8(string1, string2, status);
+    // This one seems to work better.
+    uni_result = uni_coll->compare(us1, us2, status);
+
+    if (uni_result != uni_result_utf8) {
+      cout << "# UNI_COLL COMPARE Unicode String " << uni_result << " ";
+      cout << "# UNI_COLL COMPARE UTF8 String " << uni_result_utf8 << endl;
+      cout << "# ******* results different in " << label_string << endl;
+    }
+
+    if (U_FAILURE(status)) {
+        json_object_object_add(return_json,
+                               "error", json_object_new_string("error in uni_coll_compare"));
+      no_error = false;
+      cout << "## Error in uni_coll->compare: " << label_string << " : " << error_message.c_str() << endl;
+    }
+    if (uni_coll) {
+      UColAttributeValue alternate_value = uni_coll->getAttribute(UCOL_ALTERNATE_HANDLING, status);
+    }
+    delete uni_coll;
   }
 
-  json_object_object_add(
-      return_json, "result", json_object_new_boolean(coll_result));
+  if (no_error) {
+    int64_t numeric_result = int64_t(uni_result);
+    if (uni_result == UCOL_GREATER) {
+      coll_result = false;
+
+      cout << "# UNI_RESULT: " << label_string << " " << uni_result <<
+          "  s1: " << string1 << " s2: " << string2 << endl;
+
+      // Check unescaped versions.
+      char char_out1[1000] = "";
+      char char_out2[1000] = "";
+      int32_t chars_out = us1.extract(char_out1, 1000, nullptr, status);
+      if (U_FAILURE(status)) {
+        test_result = error_message.c_str();
+        json_object_object_add(return_json,
+                               "error", json_object_new_string("error extracting us1"));
+        cout << "# Error in us1.extract: " << label_string << " : " << test_result << endl;
+      }
+
+      int32_t chars_out2 = us2.extract(char_out2, 1000, nullptr, status);
+      if (U_FAILURE(status)) {
+        test_result = error_message.c_str();
+        // TODO: report the error in creating the instance
+        test_result = error_message.c_str();
+        json_object_object_add(return_json,
+                               "error", json_object_new_string("error extracting us2"));
+        cout << "# Error in us2.extract: " << label_string << " : " << test_result << endl;
+      }
+
+      // Include data compared in the failing test
+      json_object_object_add(
+          return_json, "s1", json_object_new_string(string1.c_str()));
+      json_object_object_add(
+          return_json, "s2", json_object_new_string(string2.c_str()));
+
+      // What was the actual returned value?
+      json_object_object_add(
+          return_json, "compare", json_object_new_int64(numeric_result));
+    } else {
+      coll_result = true;
+    }
+
+    json_object_object_add(
+        return_json, "result", json_object_new_boolean(coll_result));
+  }
 
   return  json_object_to_json_string(return_json);
 }
diff --git a/executors/cpp/main.cpp b/executors/cpp/main.cpp
@@ -79,7 +79,7 @@ int main(int argc, const char** argv)
       // TODO: get from the array of supported tests
       json_object *tests_supported = json_object_new_object();
       json_object *test_array = json_object_new_array();
-      for (int index = 0; index < 5; index ++) {
+      for (int index = 0; index < 4; index ++) {
         json_object_array_add(test_array,
                               json_object_new_string(supported_tests[index].c_str()));
       }

diff --git a/executors/dart_web/out/executor.js b/executors/dart_web/out/executor.js
@@ -192,7 +192,7 @@ rl.on('line', function (line) {
 
         if ('error' in outputLine) {
           // To get the attention of the driver
-          console.log("#!! ERROR in NODE call: " + JSON.stringify(outputLine));
+          console.log("#!! ERROR in DART_WEB: " + test_type + ": " + JSON.stringify(outputLine));
         }
 
         // Send result to stdout for verification

diff --git a/executors/test_strings.txt b/executors/test_strings.txt
@@ -43,7 +43,10 @@
 {"test_type": "collation_short", "label":"0001377","s1":"\\u0009?","s2":"\\u000a!","line":12,"ignorePunctuation":true}
 {"test_type": "collation_short", "label":"00008","s1":"ä","s2":"Ã¤","compare_type":"=","test_description":" simple CEs & expansions","rules":"&\\x01\n<<<\\u0300\n&9<\\x00\n&\\uA00A\\uA00B=\\uA002\n&\\uA00A\\uA00B\\u00050005=\\uA003"}
 
-=======
+{"test_type": "collation_short", "label": "0011232", "s1": "<a", "s2": "<A", "line": 9867, "ignorePunctuation": true}
+{"test_type": "collation_short", "label": "0011232_Aa", "s1": "<A", "s2": "<a", "line": 9867, "ignorePunctuation": true}
+{"test_type": "collation_short", "label": "0011232_aA", "s1": "a", "s2": "A", "line": 9867, "ignorePunctuation": true}        
+
 {"test_type": "lang_names", "label": "01", "language_label": "en", "locale_label": "af"}
 {"test_type": "lang_names", "label": "01", "language_label": "", "locale_label": "fr"}
 {"test_type": "lang_names", "label": "01", "language_label": "de", "locale_label": "fr"}
@@ -120,7 +123,6 @@
 
 {"label":"0598","locale":"zh-TW","skeleton":"compact-short percent sign-accounting-except-zero","input":"91827.3645","options":{"notation":"compact","compactDisplay":"short","style":"unit","unit":"percent","signDisplay":"exceptZero","currencySign":"accounting"}, "test_type": "number_fmt"}
 {"label":"0598","locale":"zh-TW","skeleton":"compact-short percent sign-accounting-except-zero","input":"9182734567890","options":{"notation":"compact","compactDisplay":"short","style":"unit","unit":"percent","signDisplay":"exceptZero","currencySign":"accounting"}, "test_type": "number_fmt"}    
->>>>>>> 8eca805e8c4cd69552153c7137eeea111144d528
 
 
 {"label":"0598","locale":"zh-TW","skeleton":"compact-short percent sign-accounting-except-zero","input":"91827.3645","options":{"notation":"compact","compactDisplay":"short","style":"unit","unit":"percent","signDisplay":"exceptZero","currencySign":"accounting"}, "test_type": "number_fmt"}
@@ -131,6 +133,8 @@
 {"test_type": "number_fmt", "label":"2062_long","locale":"es-MX","skeleton":"currency/EUR unit-width-long .000","input":"91827.3645","options":{"style":"currency","currencyDisplay":"narrowSymbol","currency":"EUR","unitDisplay":"long","maximumFractionDigits":3,"minimumFractionDigits":3}}
 {"test_type": "number_fmt", "label": "0219", "locale": "zh-TW", "skeleton": "scientific/+ee/sign-always percent precision-integer", "input": "0", "options": {"notation": "scientific", "style": "unit", "unit": "percent", "maximumFractionDigits": 0, "minimumFractionDigits": 0, "roundingType": "fractionDigits"}}
 
+    {"test_type": "number_fmt", "label": "0219", "locale": "zh-TW", "skeleton": "scientific/+ee/sign-always", "input": "1234.01", "options": {"notation": "scientific", "style": "unit", "unit": "percent", "maximumFractionDigits": 0, "minimumFractionDigits": 0, "roundingType": "fractionDigits"}}
+
 {"test_type": "number_fmt","label": "0054", "locale": "es-MX", "skeleton": "scientific/+ee/sign-always percent unit-width-narrow", "input": "0", "options": {"notation": "scientific", "conformanceExponent": "+ee", "conformanceSign": "always", "style": "unit", "unit": "percent", "unitDisplay": "narrow", "currencyDisplay": "narrowSymbol", "maximumFractionDigits": 6}}
 
 // Check significant digits
@@ -154,9 +158,31 @@
 {"test_type": "number_fmt", "label":"0648-no-max-fraction","locale":"es-MX","skeleton":"compact-short percent decimal-always","input":"0","options":{"notation":"compact","compactDisplay":"short","style":"unit","unit":"percent","conformanceDecimalAlways":true}}
 {"test_type": "number_fmt", "label":"0649","locale":"es-MX","skeleton":"compact-short percent decimal-always","input":"91827.3645","options":{"notation":"compact","compactDisplay":"short","style":"unit","unit":"percent","conformanceDecimalAlways":true,"maximumFractionDigits":2}}
 {"test_type": "number_fmt", "label":"0649","locale":"es-MX","skeleton":"compact-short percent decimal-always","input":"91827.3645","options":{"notation":"compact","compactDisplay":"short","style":"unit","unit":"percent","conformanceDecimalAlways":true}}
+
 {"test_type": "number_fmt", "label":"5913","op":"format", "locale": "en", "skeleton":".00","input":"123456789","options":{"roundingMode":"halfEven","minimumIntegerDigits":1,"maximumFractionDigits":0,"useGrouping":false}}
 {"test_type": "number_fmt", "label":"5913","op":"format", "locale":"es-MX", "skeleton":"percent .##/@@@+","input":"123456789.9876543210","options":{"roundingMode":"halfEven","minimumIntegerDigits":1,"maximumFractionDigits":0,"useGrouping":false}}
 
+{"test_type": "number_fmt", "label":"5913","op":"format", "locale": "en", "skeleton":".0","input":"123456789","options":{"roundingMode":"halfEven","minimumIntegerDigits":1,"maximumFractionDigits":0,"useGrouping":false}}
+{"test_type": "number_fmt", "label":"5913","op":"format", "locale":"es-MX", "skeleton":"percent .##/@@@+","input":"123456789.9876543210","options":{"roundingMode":"halfEven","minimumIntegerDigits":1,"maximumFractionDigits":0,"useGrouping":false}}
+
+// Skeletons
+{"test_type": "number_fmt", "label":"s1", "pattern": "0.0000E0", "skeleton": "scientific/+e .0000/@+", "input": "1234.01", "options":{}}
+{"test_type": "number_fmt", "label":"s2", "locale": "en", "skeleton": "integer-width/##00 ./@+", "input":"1.91", "options":{}}
+{"test_type": "number_fmt", "label":"s", "pattern":"#.#", "skeleton": ".#/@@+", "input":"12.3456", "options":{}}
+{"test_type": "number_fmt", "label":"s", "pattern":"@@@", "skeleton": "@@@", "input":"12.3456", "options":{}}       
+{"test_type": "number_fmt", "label":"s", "pattern":"@@@", "skeleton": "@@@ group-off", "input":"123456", "options":{}}
+{"test_type": "number_fmt", "label":"s", "pattern":"@@@", "skeleton": "@@@ group-off", "input":"0.00123456", "options":{}}    
+
+{"test_type": "number_fmt", "label":"s", "pattern":"@@###", "skeleton": "@@### group-off", "input":"123456000", "options":{}}    
+{"test_type": "number_fmt", "label":"s", "pattern":"@@###", "skeleton": "@@### group-off", "input":".00123456000", "options":{}}    
+{"test_type": "number_fmt", "label":"s", "pattern":"@@###", "skeleton": "@@### group-off", "input":".00123", "options":{}}    
+
+{"test_type": "number_fmt", "label":"s", "pattern":"@@@@E0", "skeleton": "scientific/+e .0000/@@+", "input":".00123", "options":{}}    
+{"test_type": "number_fmt", "label":"s", "pattern":"0.0##E0", "skeleton": "scientific/+e .##/@@+ integer-width/*000", "input":".00123e110", "options":{}}    
+
+{"test_type": "number_fmt", "label":"LONG", "pattern":"#", "skeleton": "@+ group-off", "input":"10000000000000000000000000000000000000000000000000001", "options":{}}    
+10000000000000000000000000000000000000000000000000001
+10000000000000000000000000000000000000000000000000000    
 
 # LOCALE_INFO
 {"test_type":"likely_subtags", "option":"maximize", "locale":"en", "label":"en_max"}

diff --git a/genData100.sh b/genData100.sh
@@ -25,7 +25,6 @@ then
   bash setup.sh
 fi
 
-export TEMP_DIR=TEMP_DATA
 export TEST_LIMIT=100
 
 export TEMP_DIR=TEMP_DATA_100

diff --git a/schema/check_generated_data.py b/schema/check_generated_data.py
@@ -8,6 +8,7 @@
 
 import logging
 import logging.config
+import multiprocessing as mp
 import os.path
 import sys
 
@@ -45,6 +46,7 @@ def main(args):
     logging.debug('test types = %s', ALL_TEST_TYPES)
 
     validator = schema_validator.ConformanceSchemaValidator()
+
     # Todo: use setters to initialize validator
     validator.schema_base = '.'
     validator.test_data_base = test_data_path