From e10c58e2f45138135dced7e3ecbdbd498af54519 Mon Sep 17 00:00:00 2001 From: Emerson Rocha Date: Mon, 23 May 2022 23:38:02 -0300 Subject: [PATCH] 999999999_54872.py (#37, #39): first attempt to load all public P-Codes to SQLite (still with some off-by-one error) --- README.md | 3 + officinam/999999999/0/999999999_7200235.py | 67 +++++++++++--- officinam/999999999/1603_45_16.sh | 100 +++++++++++++++++++-- 3 files changed, 147 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 1edf8b2..1715392 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ # Lexicographī sine fīnibus **[working draft] The Etica.AI + HXL-CPLP [monorepo](https://en.wikipedia.org/wiki/Monorepo) with public domain automation scripts for [practical lexicography](https://en.wikipedia.org/wiki/Lexicography) on selected topics. Goal of both compilation of existing translations ([such as Wikidata](https://www.wikidata.org/wiki/Wikidata:Licensing)) and preparation for new terminology translation initiatives.** + +Namespace explanations at [HXL-CPLP-Vocab_Auxilium-Humanitarium-API/1603_1_1](https://docs.google.com/spreadsheets/d/1ih3ouvx_n8W5ntNcYBqoyZ2NRMdaA0LRg5F9mGriZm4/edit#gid=2095477004). ## Disclaimers diff --git a/officinam/999999999/0/999999999_7200235.py b/officinam/999999999/0/999999999_7200235.py index 2842c3b..9a943ed 100755 --- a/officinam/999999999/0/999999999_7200235.py +++ b/officinam/999999999/0/999999999_7200235.py @@ -125,7 +125,10 @@ '-9:#meta+id|-8:#country+code+v_iso3|-7:#country+code+v_iso2' Work with local COD-AB index (levels) . . . . . . . . . . . . . . . . . . . . . - {0} --methodus='cod_ab_index_levels' + {0} --methodus='cod_ab_index_levels' --punctum-separato-ad-tab + + {0} --methodus='cod_ab_index_levels' --sine-capite \ +--cum-columnis='#item+conceptum+numerordinatio' Process XLSXs from external sources . . . . . . . . . . . . . . . . . . . . . . 
{0} --methodus=xlsx_metadata 999999/1603/45/16/xlsx/ago.xlsx @@ -541,6 +544,17 @@ def make_args(self, hxl_output=True): const=True, default=False ) + # sine (+ ablative) https://en.wiktionary.org/wiki/sine#Latin + # capite, s, n, ablativus, https://en.wiktionary.org/wiki/caput#Latin + parser.add_argument( + '--sine-capite', + help='Output without header', + metavar="sine_capite", + dest="sine_capite", + action='store_const', + const=True, + default=False + ) # parser.add_argument( # # '--venandum-insectum-est, --debug', @@ -761,6 +775,9 @@ def execute_cli(self, pyargs, stdin=STDIN, _stdout=sys.stdout, data_json_len, data_json_len_uniq, _path)) return self.EXIT_OK + if pyargs.sine_capite: + caput = None + csv_imprimendo(caput, data, punctum_separato) return self.EXIT_OK @@ -814,6 +831,8 @@ def execute_cli(self, pyargs, stdin=STDIN, _stdout=sys.stdout, if pyargs.methodus == 'xlsx_ad_csv': xlsx.praeparatio() caput, data = xlsx.imprimere() + if pyargs.sine_capite: + caput = None csv_imprimendo(caput, data, punctum_separato=punctum_separato) xlsx.finis() @@ -841,6 +860,8 @@ def execute_cli(self, pyargs, stdin=STDIN, _stdout=sys.stdout, # print(type(caput), caput) # print(type(data), data) # raise NotImplementedError('test test') + if pyargs.sine_capite: + caput = None csv_imprimendo(caput, data, punctum_separato=punctum_separato) # print() @@ -1008,27 +1029,45 @@ def hxltm_carricato__cod_ab_levels( Returns: Tuple[list, list]: _description_ """ - columnae = [ + caput_novo = ['#item+conceptum+numerordinatio'] + caput_cum_columnis = [ '#country+code+v_unm49', '#meta+source+cod_ab_level', + '#country+code+v_iso3', + '#country+code+v_iso2' ] - # print(' ooi') + data_novis = [] caput, data = hxltm_cum_aut_sine_columnis_simplicibus( - caput, data, columnae) - # _ordo_novo = [] + caput, data, caput_cum_columnis) + numerordinatio_praefixo = numerordinatio_neo_separatum( numerordinatio_praefixo, ':') - caput_novo = ['#item+conceptum+numerordinatio'] - caput_novo.extend(caput) - 
data_novis = [] + + caput_novo.extend(caput_cum_columnis) + + data.sort(key=lambda linea: int(linea[0])) + + _numerordinatio__done = [] + for linea in data: - linea_novae = [] - linea_novae.append('{0}:{1}:{2}'.format( - numerordinatio_praefixo, linea[0], linea[1] - )) - linea_novae.extend(linea) - data_novis.append(linea_novae) + for cod_ab_level in range(0, int(linea[1])): + linea_novae = [] + numerordinatio = '{0}:{1}:{2}'.format( + numerordinatio_praefixo, linea[0], cod_ab_level + ) + + if numerordinatio in _numerordinatio__done: + continue + + _numerordinatio__done.append(numerordinatio) + linea_novae.append(numerordinatio) + linea_novae.append(linea[0]) + linea_novae.append(cod_ab_level) + linea_novae.append(linea[2]) + linea_novae.append(linea[3]) + # linea_novae.extend(linea) + data_novis.append(linea_novae) # raise NotImplementedError # return caput, data diff --git a/officinam/999999999/1603_45_16.sh b/officinam/999999999/1603_45_16.sh index 4d05a53..ddd3d38 100755 --- a/officinam/999999999/1603_45_16.sh +++ b/officinam/999999999/1603_45_16.sh @@ -100,8 +100,6 @@ bootstrap_1603_45_16__all() { echo " LIST HERE <${opus_temporibus_temporarium}>" echo "" - - # while IFS=, read -r iso3 source_url; do { # remove read -r to not skip first line @@ -131,7 +129,6 @@ bootstrap_1603_45_16__all() { continue fi - # echo "numerordinatio_praefixo $numerordinatio_praefixo" # bootstrap_1603_45_16__item "1603_45_16_24" "24" "AGO" "AO" "3" "1" "0" bootstrap_1603_45_16__item "$numerordinatio_praefixo" "$unm49" "$v_iso3" "$v_iso2" "$cod_ab_level_max" "1" "0" @@ -142,6 +139,94 @@ bootstrap_1603_45_16__all() { } +####################################### +# Convert the XLSXs to intermediate formats on 999999/1603/45/16 using +# 999999999_7200235.py to 1603/45/16/{cod_ab_level}/ +# +# @TODO: potentially use more than one source (such as IGBE data for BRA) +# instead of directly from OCHA +# +# Globals: +# ROOTDIR +# +# Arguments: +# est_meta_datapackage +# est_tabulae_sqlite +# 
est_tabulae_postgresql + est_graphicus_rdf + + Outputs: + Convert files +####################################### +bootstrap_1603_45_16__apothecae() { + # objectivum_iso3661p1a3="${1:-""}" + est_meta_datapackage="${1:-""}" + est_tabulae_sqlite="${2:-""}" + est_tabulae_postgresql="${3:-""}" + est_graphicus_rdf="${4:-""}" + # est_postgresql="${2:-""}" + + nomen="1603_45_16" + + # echo "${FUNCNAME[0]} ... [$objectivum_iso3661p1a3]" + echo "${FUNCNAME[0]} ... [@TODO]" + opus_temporibus_temporarium="${ROOTDIR}/999999/0/1603_45_16.apothecae.todo.txt" + objectivum_archivum_datapackage="apothecae~${nomen}.datapackage.json" + objectivum_archivum_sqlite="apothecae~${nomen}.sqlite" + # apothecae.datapackage.json + # set -x + "${ROOTDIR}/999999999/0/999999999_7200235.py" \ + --methodus='cod_ab_index_levels' \ + --sine-capite \ + --cum-columnis='#item+conceptum+numerordinatio' \ + >"${opus_temporibus_temporarium}" + # set +x + + ## 2022-05-23: we will skip LSA admin1 for now as it cannot extract + ## number (it uses 3-letter P-codes) + # admin1Name_en admin1Pcode + # Maseru LSA + # Butha-Buthe LSB + # Leribe LSC + # (...) + sed -i '/1603:45:16:426:0/d' "${opus_temporibus_temporarium}" + sed -i '/1603:45:16:426:1/d' "${opus_temporibus_temporarium}" + + echo "" + echo " LIST HERE <${opus_temporibus_temporarium}>" + echo "" + + if [ -n "$est_meta_datapackage" ]; then + set -x + "${ROOTDIR}/999999999/0/1603_1.py" \ + --methodus='data-apothecae' \ + --data-apothecae-ex-archivo="${opus_temporibus_temporarium}" \ + --data-apothecae-ad="$objectivum_archivum_datapackage" + set +x + fi + + if [ -n "$est_tabulae_sqlite" ]; then + set -x + "${ROOTDIR}/999999999/0/1603_1.py" \ + --methodus='data-apothecae' \ + --data-apothecae-ex-archivo="${opus_temporibus_temporarium}" \ + --data-apothecae-ad="$objectivum_archivum_sqlite" + set +x + fi + + if [ -n "$est_tabulae_postgresql" ]; then + echo "est_tabulae_postgresql requires specify connection" + echo "skiping for now..." 
+ fi + + if [ -n "$est_graphicus_rdf" ]; then + echo "TODO est_graphicus_rdf" + fi + + # ./999999999/0/1603_1.py --methodus='data-apothecae' --data-apothecae-ex-archivo='999999/0/apothecae-list.txt' --data-apothecae-ad='apothecae.datapackage.json' + +} + ####################################### # Convert the XLSXs to intermediate formats on 999999/1603/45/16 using # 999999999_7200235.py to 1603/45/16/{cod_ab_level}/ @@ -205,7 +290,7 @@ bootstrap_1603_45_16__item() { echo "cod_ab_levels $cod_ab_level_max" - for ((i=0;i<=cod_ab_level_max;i++)); do + for ((i = 0; i <= cod_ab_level_max; i++)); do cod_level="$i" if [ "$_iso3661p1a3_lower" == "bra" ] && [ "$cod_level" == "2" ]; then echo "" @@ -816,12 +901,10 @@ __temp_download_external_cod_data() { # __temp_download_external_cod_data # exit 1 -bootstrap_1603_45_16__all +# bootstrap_1603_45_16__all # bootstrap_999999_1603_45_16_neo "" # bootstrap_999999_1603_45_16_neo "BRA" - -# bootstrap_1603_45_16__item "76" "BRA" -# bootstrap_1603_45_16__item "1603_45_16_24" "24" "AGO" "AO" "1" "0" +bootstrap_1603_45_16__apothecae "1" "1" "" "" exit 1 echo "after here is old scripts that need to be refatored" @@ -957,7 +1040,6 @@ set +x # rapper -g 999999/0/ibge_un_adm2.no1.skos.ttl # rapper --output dot --guess 999999/0/ibge_un_adm2.no1.skos.ttl - #### @TODO: population -------------------------------------------------------- # https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Countries_sorted_by_population # https://w.wiki/5CDt