From 796d517a9fc42d5a477fd05a233aa0b7c2e242ad Mon Sep 17 00:00:00 2001 From: tsteven4 <13596209+tsteven4@users.noreply.github.com> Date: Mon, 24 Jul 2023 18:28:23 -0600 Subject: [PATCH] improve correctness and speed of duplicate filter. (#1144) * improve correctness and speed of duplicate filter. The duplicate filter could errouneously delete points that were not duplicates if the crc's happened to match. waypt_del(Waypoint*) is inefficent as it requires a search of the list to find the matching waypoint. Support waypt_del with iterators. * retire util_crc.cc * improve duplicate to linear complexity * polish new list creation. * Remove final remnants of 'exported' * Revert "Remove final remnants of 'exported'" This reverts commit 6996e4d7dd4b1812f922edd2c5bd9a3d1bc6fada. --------- Co-authored-by: Robert Lipe --- CMakeLists.txt | 1 - defs.h | 6 +- util_crc.cc => deprecated/util_crc.cc | 0 duplicate.cc | 157 +++++++++----------------- duplicate.h | 13 +-- xmldoc/filters/duplicate.xml | 2 +- 6 files changed, 57 insertions(+), 122 deletions(-) rename util_crc.cc => deprecated/util_crc.cc (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 14eb5ca7a2..465b00573d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,7 +182,6 @@ set(SUPPORT src/core/xmltag.cc units.cc util.cc - util_crc.cc vecs.cc waypt.cc xmlgeneric.cc diff --git a/defs.h b/defs.h index f417e52b05..635f5ae12d 100644 --- a/defs.h +++ b/defs.h @@ -492,6 +492,7 @@ class WaypointList : private QList void waypt_del(Waypoint* wpt); // a.k.a. erase() // FIXME: Generally it is inefficient to use an element pointer or reference to define the element to be deleted, use iterator instead, // and/or implement pop_back() a.k.a. removeLast(), and/or pop_front() a.k.a. removeFirst(). + iterator waypt_del(iterator it) {return erase(it);} void del_rte_waypt(Waypoint* wpt); void waypt_compute_bounds(bounds* bounds) const; Waypoint* find_waypt_by_name(const QString& name) const; @@ -1110,11 +1111,6 @@ int parse_distance(const QString& str, double* val, double scale, const char* mo int parse_speed(const char* str, double* val, double scale, const char* module); int parse_speed(const QString& str, double* val, double scale, const char* module); -/* - * From util_crc.c - */ -unsigned long get_crc32(const void* data, int datalen); - /* * Color helpers. */ diff --git a/util_crc.cc b/deprecated/util_crc.cc similarity index 100% rename from util_crc.cc rename to deprecated/util_crc.cc diff --git a/duplicate.cc b/duplicate.cc index 82f1eb929a..fb471154c2 100644 --- a/duplicate.cc +++ b/duplicate.cc @@ -1,7 +1,7 @@ /* exact duplicate point filter utility. - Copyright (C) 2002-2014 Robert Lipe, robertlipe+source@gpsbabel.org + Copyright (C) 2002-2023 Robert Lipe, robertlipe+source@gpsbabel.org This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,10 +22,10 @@ #include "duplicate.h" #include // for stable_sort -#include // for snprintf -#include // for memset, strncpy #include // for QDateTime +#include // for QList, QList<>::iterator, QList<>::const_iterator +#include // for QMultiHash #include "defs.h" #include "geocache.h" // for Geocache @@ -34,54 +34,7 @@ #if FILTERS_ENABLED -DuplicateFilter::btree_node* DuplicateFilter::addnode(btree_node* tree, btree_node* newnode, btree_node** oldnode) -{ - btree_node* last = nullptr; - - if (*oldnode) { - *oldnode = nullptr; - } - - if (!tree) { - return (newnode); - } - - btree_node* tmp = tree; - - while (tmp) { - last = tmp; - if (newnode->data < tmp->data) { - tmp = tmp->right; - } else if (newnode->data > tmp->data) { - tmp = tmp->left; - } else { - if (oldnode) { - *oldnode = tmp; - } - return (nullptr); - } - } - - if (newnode->data < last->data) { - last->right = newnode; - } else { - last->left = newnode; - } - - return (tree); -} - -void DuplicateFilter::free_tree(btree_node* tree) -{ - if (tree->left) { - free_tree(tree->left); - } - if (tree->right) { - free_tree(tree->right); - } - xfree(tree); -} - +#define MYNAME "duplicate" /* It looks odd that we have different comparisons for date and index. @@ -111,83 +64,79 @@ one with the smaller index (i.e. the first of those two points that we came across while importing waypoints.) In the (common) case that we have no exported dates, the dates will all -be zero so the sort will end up being an expensive no-op (expensive -because, sadly, quicksort can be O(n^2) on presorted elements.) +be zero so the sort will end up being an expensive no-op. However, the +complexity of this filter is dominated by other concerns. */ +void DuplicateFilter::init() +{ + if (!lcopt && !snopt) { + fatal(MYNAME ": one or both of the shortname and location options are required.\n"); + } +} + void DuplicateFilter::process() { - btree_node* btmp = nullptr; - btree_node* sup_tree = nullptr; - btree_node* oldnode = nullptr; - struct { - char shortname[32]; - char lat[13]; - char lon[13]; - } dupe; - Waypoint* delwpt = nullptr; - - auto htable = *global_waypoint_list; + int delete_flag; // &delete_flag != nullptr + + auto wptlist = *global_waypoint_list; auto compare_lambda = [](const Waypoint* wa, const Waypoint* wb)->bool { return wa->gc_data->exported > wb->gc_data->exported; }; - std::stable_sort(htable.begin(), htable.end(), compare_lambda); + std::stable_sort(wptlist.begin(), wptlist.end(), compare_lambda); - for (Waypoint* waypointp : htable) { - - memset(&dupe, '\0', sizeof(dupe)); - - if (snopt) { - strncpy(dupe.shortname, CSTRc(waypointp->shortname), sizeof(dupe.shortname) - 1); - } + QMultiHash wpthash; + for (Waypoint* waypointp : wptlist) { + waypointp->extra_data = nullptr; + QString key; if (lcopt) { /* The degrees2ddmm stuff is a feeble attempt to * get everything rounded the same way in a precision * that's "close enough" for determining duplicates. */ - snprintf(dupe.lat, sizeof(dupe.lat), "%11.3f", - degrees2ddmm(waypointp->latitude)); - snprintf(dupe.lon, sizeof(dupe.lon), "%11.3f", - degrees2ddmm(waypointp->longitude)); - + key = QStringLiteral("%1%2") + .arg(degrees2ddmm(waypointp->latitude), 11, 'f', 3) + .arg(degrees2ddmm(waypointp->longitude), 11, 'f', 3); + } + + if (snopt) { + key.append(waypointp->shortname); } - unsigned long crc = get_crc32(&dupe, sizeof(dupe)); - - auto* newnode = (btree_node*)xcalloc(sizeof(btree_node), 1); - newnode->data = crc; - newnode->wpt = waypointp; - - btmp = addnode(sup_tree, newnode, &oldnode); + wpthash.insert(key, waypointp); + } - if (btmp == nullptr) { - delete delwpt; - if (correct_coords && oldnode && oldnode->wpt) { - oldnode->wpt->latitude = waypointp->latitude; - oldnode->wpt->longitude = waypointp->longitude; + const QList keys = wpthash.uniqueKeys(); + for (const auto& key : keys) { + const QList values = wpthash.values(key); + if (values.size() > 1) { + Waypoint* wptfirst = values.last(); // first inserted + if (correct_coords) { + Waypoint* wptlast = values.front(); // last inserted + wptfirst->latitude = wptlast->latitude; + wptfirst->longitude = wptlast->longitude; } - delwpt = waypointp; - waypt_del(waypointp); /* collision */ - xfree(newnode); - if (purge_duplicates && oldnode) { - if (oldnode->wpt) { - waypt_del(oldnode->wpt); - delete oldnode->wpt; - oldnode->wpt = nullptr; + for (auto it = values.cbegin(); it != values.cend(); ++it) { + Waypoint* wpt = *it; + if (purge_duplicates || (wpt != wptfirst)) { + wpt->extra_data = &delete_flag; } } - - } else { - sup_tree = btmp; } } - delete delwpt; - - if (sup_tree) { - free_tree(sup_tree); + // For lineary complexity build a new list from the points we keep. + WaypointList oldlist; + waypt_swap(oldlist); + + for (Waypoint* wpt : qAsConst(oldlist)) { + if (wpt->extra_data == nullptr) { + waypt_add(wpt); + } else { + delete wpt; + } } } diff --git a/duplicate.h b/duplicate.h index 0fd5f85cad..d8d165209c 100644 --- a/duplicate.h +++ b/duplicate.h @@ -1,7 +1,7 @@ /* exact duplicate point filter utility. - Copyright (C) 2002-2014 Robert Lipe, robertlipe+source@gpsbabel.org + Copyright (C) 2002-2023 Robert Lipe, robertlipe+source@gpsbabel.org This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -37,6 +37,7 @@ class DuplicateFilter:public Filter { return &args; } + void init() override; void process() override; private: @@ -64,16 +65,6 @@ class DuplicateFilter:public Filter }, }; - struct btree_node { - btree_node* left; - btree_node* right; - unsigned long data; - Waypoint* wpt; - }; - - static btree_node* addnode(btree_node* tree, btree_node* newnode, btree_node** oldnode); - void free_tree(btree_node* tree); - }; #endif #endif // DUPLICATE_H_INCLUDED_ diff --git a/xmldoc/filters/duplicate.xml b/xmldoc/filters/duplicate.xml index 922522f492..9ef4d4be72 100644 --- a/xmldoc/filters/duplicate.xml +++ b/xmldoc/filters/duplicate.xml @@ -4,7 +4,7 @@ short name (traditionally a waypoint's name on the GPS receiver), and/or their location (to a precision of 6 decimals). This filter supports two options that specify how duplicates will be recognized, and . -Generally, at least one of these options is required. +At least one of these options is required. Using the duplicate filter to suppress points with the same