Skip to content

Commit c3bb113

Browse files
committed
Add bz2 and gz input file support, add options --stats and --verbose for more output, and update util
1 parent 291a2ef commit c3bb113

File tree

8 files changed

+152
-66
lines changed

8 files changed

+152
-66
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,14 @@ find_package(ZLIB)
2727

2828
if (NOT BZIP2_FOUND)
2929
add_definitions( -DSPATIALJOIN_NO_BZIP2=True )
30+
else()
31+
add_definitions( -DPBUTIL_BZIP2_FOUND=True )
3032
endif()
3133

3234
if (NOT ZLIB_FOUND)
3335
add_definitions( -DSPATIALJOIN_NO_ZLIB=True )
36+
else()
37+
add_definitions( -DPBUTIL_ZLIB_FOUND=True )
3438
endif()
3539

3640
# export compile commands to tools like clang

src/spatialjoin/CMakeLists.txt

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
file(GLOB_RECURSE SPATIALJOIN_SRC *.cpp)
22
list(REMOVE_ITEM SPATIALJOIN_SRC TestMain.cpp)
3-
4-
if(NOT CMAKE_TESTING_ENABLED)
5-
list(FILTER util_SRC EXCLUDE REGEX ".*/tests/.*")
6-
endif()
3+
list(FILTER SPATIALJOIN_SRC EXCLUDE REGEX ".*/tests/.*")
74

85
set(spatialjoin_main SpatialJoinMain.cpp)
96

@@ -20,7 +17,7 @@ target_include_directories(spatialjoin-dev PUBLIC
2017
$<BUILD_INTERFACE:${SPATIALJOIN_INCLUDE_DIR}>
2118
)
2219

23-
target_link_libraries(spatialjoin spatialjoin-dev pb_util -lpthread)
20+
target_link_libraries(spatialjoin spatialjoin-dev pb_util pb_util_geo -lpthread)
2421

2522
if (BZIP2_FOUND)
2623
target_include_directories(spatialjoin PUBLIC ${BZIP2_INCLUDE_DIR} )

src/spatialjoin/SpatialJoinMain.cpp

Lines changed: 134 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,11 @@ void printHelp(int argc, char** argv) {
4141
<< "\n"
4242
<< "(C) 2023-" << YEAR << " " << COPY << "\n"
4343
<< "Authors: " << AUTHORS << "\n\n"
44-
<< "Usage: " << argv[0] << " [--help] [-h] [input]\n\n"
44+
<< "Usage: " << argv[0] << " [OPTIONS] [INPUT1] [INPUT1]\n\n"
45+
<< "With input from either stdin, or from file(s) [INPUT1] and [INPUT2] "
46+
"(.bz2 or .gz\nsupported). If "
47+
<< "both [INPUT1] and [INPUT2] are given, compute a non-self join with\n"
48+
<< "[INPUT1] on the left side, [INPUT2] on the right side.\n\n"
4549
<< "Allowed options:\n\n"
4650
<< std::setfill(' ') << std::left << "General:\n"
4751
<< std::setw(42) << " -h [ --help ]"
@@ -52,6 +56,11 @@ void printHelp(int argc, char** argv) {
5256
<< "cache directory for intermediate files\n"
5357
<< std::setw(42) << " --de9im"
5458
<< "output DE-9IM relationships\n"
59+
<< std::setw(42) << " --within-distance (default: '')"
60+
<< "if set to non-negative value, only compute for each object\n"
61+
<< std::setw(42) << " "
62+
<< "the objects within the given distance\n\n"
63+
<< std::setfill(' ') << std::left << "Formatting:\n"
5564
<< std::setw(42) << " --prefix (default: '')"
5665
<< "prefix added at the beginning of every relation\n"
5766
<< std::setw(42) << " --intersects (default: ' intersects ')"
@@ -70,12 +79,11 @@ void printHelp(int argc, char** argv) {
7079
<< "separator between crossing geometry IDs\n"
7180
<< std::setw(42) << " --suffix (default: '\\n')"
7281
<< "suffix added at the beginning of every relation\n\n"
73-
<< std::setw(42) << " --within-distance (default: '')"
74-
<< "if set to non-negative value, only compute for each object the "
75-
"objects within the given distance\n\n"
7682
<< std::setfill(' ') << std::left << "Geometric computation:\n"
7783
<< std::setw(42) << " --no-box-ids"
78-
<< "disable box id criteria for contains/covers/intersect computation\n"
84+
<< "disable box id criteria for contains/covers/intersect\n"
85+
<< std::setw(42) << " "
86+
<< "computation\n"
7987
<< std::setw(42) << " --no-surface-area"
8088
<< "disable surface area criteria for polygon contains/covers\n"
8189
<< std::setw(42) << " --no-oriented-envelope"
@@ -94,11 +102,17 @@ void printHelp(int argc, char** argv) {
94102
<< " --num-caches (default: " + std::to_string(NUM_THREADS) + ")"
95103
<< "number of geometry caches (if < --num-threads, syncing\n"
96104
<< std::setw(42)
97-
<< " --cache-max-size (default: " + std::to_string(DEFAULT_CACHE_SIZE) + ")"
105+
<< " --cache-max-size (default: " + std::to_string(DEFAULT_CACHE_SIZE) +
106+
")"
98107
<< "maximum approx. size in bytes of cache per type and thread\n"
99108
<< std::setw(42) << " --no-geometry-checks"
100-
<< "do not compute geometric relations, only report number of "
101-
"candidates\n"
109+
<< "do not compute geometric relations, only report number of\n"
110+
<< std::setw(42) << " "
111+
<< "candidates\n"
112+
<< std::setw(42) << " --stats"
113+
<< "output stats\n"
114+
<< std::setw(42) << " -v [ --verbose ]"
115+
<< "verbose logging\n"
102116
<< std::endl;
103117
}
104118

@@ -134,6 +148,9 @@ int main(int argc, char** argv) {
134148
bool noGeometryChecks = false;
135149
bool computeDE9IM = false;
136150

151+
bool printStats = false;
152+
bool verbose = false;
153+
137154
size_t numThreads = NUM_THREADS;
138155
size_t numCaches = NUM_THREADS;
139156
size_t geomCacheMaxSizeBytes = DEFAULT_CACHE_SIZE;
@@ -194,6 +211,10 @@ int main(int argc, char** argv) {
194211
useFastSweepSkip = false;
195212
} else if (cur == "--use-inner-outer") {
196213
useInnerOuter = true;
214+
} else if (cur == "--stats") {
215+
printStats = true;
216+
} else if (cur == "--verbose" || cur == "-v") {
217+
verbose = true;
197218
} else {
198219
inputFiles.push_back(cur);
199220
}
@@ -265,53 +286,116 @@ int main(int argc, char** argv) {
265286
unsigned char* buf = new unsigned char[CACHE_SIZE];
266287
size_t len;
267288

268-
Sweeper sweeper({numThreads,
269-
numCaches,
270-
geomCacheMaxSizeBytes / (numThreads * 3),
271-
prefix,
272-
intersects,
273-
contains,
274-
covers,
275-
touches,
276-
equals,
277-
overlaps,
278-
crosses,
279-
suffix,
280-
useBoxIds,
281-
useArea,
282-
useOBB,
283-
useDiagBox,
284-
useFastSweepSkip,
285-
useInnerOuter,
286-
noGeometryChecks,
287-
withinDist,
288-
computeDE9IM,
289-
{},
290-
[](const std::string& s) { LOGTO(INFO, std::cerr) << s; },
291-
[](const std::string& s) { std::cerr << s; },
292-
{},
293-
{}},
294-
cache, output);
295-
296-
LOGTO(INFO, std::cerr) << "Parsing input geometries...";
289+
sj::SweeperCfg sweeperCfg{numThreads,
290+
numCaches,
291+
geomCacheMaxSizeBytes / (numThreads * 3),
292+
prefix,
293+
intersects,
294+
contains,
295+
covers,
296+
touches,
297+
equals,
298+
overlaps,
299+
crosses,
300+
suffix,
301+
useBoxIds,
302+
useArea,
303+
useOBB,
304+
useDiagBox,
305+
useFastSweepSkip,
306+
useInnerOuter,
307+
noGeometryChecks,
308+
withinDist,
309+
computeDE9IM,
310+
{},
311+
{},
312+
{},
313+
{},
314+
{}};
315+
316+
if (printStats)
317+
sweeperCfg.statsCb = [](const std::string& s) { std::cerr << s; };
318+
319+
if (verbose)
320+
sweeperCfg.logCb = [](const std::string& s) {
321+
LOGTO(INFO, std::cerr) << s;
322+
};
323+
324+
Sweeper sweeper(sweeperCfg, cache, output);
325+
326+
sweeper.log("Parsing input geometries...");
297327
auto ts = TIME();
298328

299329
sj::WKTParser parser(&sweeper, NUM_THREADS);
300330

301331
if (!inputFiles.empty()) {
302332
if (inputFiles.size() > 2) {
303-
std::cerr << "Either 1 input files (for self join), or 2 input files (for non-self join) can be provided." << std::endl;
333+
std::cerr << "Either 1 input files (for self join), or 2 input files "
334+
"(for non-self join) can be provided."
335+
<< std::endl;
304336
exit(1);
305337
}
306338
for (size_t i = 0; i < inputFiles.size(); i++) {
307-
int f = open(inputFiles[i].c_str(), O_RDONLY);
339+
if (util::endsWith(inputFiles[i], ".bz2")) {
340+
#ifndef SPATIALJOIN_NO_BZIP2
341+
auto fh = fopen(inputFiles[i].c_str(), "r");
342+
if (!fh) {
343+
std::cerr << "Could not open input file " << inputFiles[i]
344+
<< std::endl;
345+
exit(1);
346+
}
347+
int err;
348+
BZFILE* f = BZ2_bzReadOpen(&err, fh, 0, 0, NULL, 0);
349+
if (!f || err != BZ_OK) {
350+
std::cerr << "Could not open input file " << inputFiles[i]
351+
<< std::endl;
352+
exit(1);
353+
}
354+
while ((len = util::bz2readAll(f, buf, CACHE_SIZE)) > 0) {
355+
parser.parse(reinterpret_cast<char*>(buf), len, i != 0);
356+
}
308357

309-
if (f < 0) {
310-
throw std::runtime_error("Could not open input file " + inputFiles[i]);
311-
}
358+
BZ2_bzReadClose(&err, f);
359+
fclose(fh);
360+
#else
361+
std::cerr << "Could not open input file " << inputFiles[i]
362+
<< ", spatialjoin was compiled without BZip2 support"
363+
<< std::endl;
364+
exit(1);
365+
#endif
366+
} else if (util::endsWith(inputFiles[i], ".gz")) {
367+
#ifndef SPATIALJOIN_NO_ZLIB
368+
gzFile f = gzopen(inputFiles[i].c_str(), "r");
369+
if (f == Z_NULL) {
370+
std::cerr << "Could not open input file " << inputFiles[i]
371+
<< std::endl;
372+
exit(1);
373+
}
374+
while ((len = util::zreadAll(f, buf, CACHE_SIZE)) > 0) {
375+
parser.parse(reinterpret_cast<char*>(buf), len, i != 0);
376+
}
377+
378+
gzclose(f);
379+
#else
380+
std::cerr << "Could not open input file " << inputFiles[i]
381+
<< ", spatialjoin was compiled without gzip support"
382+
<< std::endl;
383+
exit(1);
384+
#endif
385+
} else {
386+
int f = open(inputFiles[i].c_str(), O_RDONLY);
387+
388+
if (f < 0) {
389+
std::cerr << "Could not open input file " << inputFiles[i]
390+
<< std::endl;
391+
exit(1);
392+
}
393+
394+
while ((len = util::readAll(f, buf, CACHE_SIZE)) > 0) {
395+
parser.parse(reinterpret_cast<char*>(buf), len, i != 0);
396+
}
312397

313-
while ((len = util::readAll(f, buf, CACHE_SIZE)) > 0) {
314-
parser.parse(reinterpret_cast<char*>(buf), len, i != 0);
398+
close(f);
315399
}
316400
}
317401
} else {
@@ -322,20 +406,20 @@ int main(int argc, char** argv) {
322406

323407
parser.done();
324408

325-
LOGTO(INFO, std::cerr) << "Done parsing ("
326-
<< TOOK(ts) / 1000000000.0 << "s).";
409+
sweeper.log("Done parsing (" + std::to_string(TOOK(ts) / 1000000000.0) +
410+
"s).");
327411
ts = TIME();
328412

329-
LOGTO(INFO, std::cerr) << "Sorting sweep events...";
413+
sweeper.log("Sorting sweep events...");
330414

331415
sweeper.flush();
332416

333-
LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s).";
417+
sweeper.log("done (" + std::to_string(TOOK(ts) / 1000000000.0) + "s).");
334418

335-
LOGTO(INFO, std::cerr) << "Sweeping...";
419+
sweeper.log("Sweeping...");
336420
ts = TIME();
337421
sweeper.sweep();
338-
LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s).";
422+
sweeper.log("done (" + std::to_string(TOOK(ts) / 1000000000.0) + "s).");
339423

340424
delete[] buf;
341425
}

src/spatialjoin/Sweeper.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,8 @@ class Sweeper {
252252
}
253253
}
254254

255+
void log(const std::string& msg);
256+
255257
util::geo::I32Box add(const util::geo::I32MultiPolygon& a,
256258
const std::string& gid, bool side,
257259
WriteBatch& batch) const;
@@ -541,8 +543,6 @@ class Sweeper {
541543
void prepareOutputFiles();
542544
void flushOutputFiles();
543545

544-
void log(const std::string& msg);
545-
546546
bool notOverlaps(const std::string& a, const std::string& b);
547547
bool notTouches(const std::string& a, const std::string& b);
548548
bool notCrosses(const std::string& a, const std::string& b);

src/spatialjoin/WKTParse.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ WKTParser::WKTParser(sj::Sweeper *sweeper, size_t numThreads)
1919
}
2020
}
2121

22+
// _____________________________________________________________________________
2223
void WKTParser::processQueue(size_t t) {
2324
ParseBatch batch;
2425
while ((batch = _jobs.get()).size()) {

src/spatialjoin/WKTParse.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,35 +137,35 @@ class WKTParserBase {
137137
} else {
138138
auto wktType = getWKTType(c, &c);
139139
if (wktType == util::geo::WKTType::POINT) {
140-
const auto &point = pointFromWKT<int32_t>(c, 0, &projFunc);
140+
const auto &point = pointFromWKTProj<int32_t>(c, 0, &projFunc);
141141
_bboxes[t] = util::geo::extendBox(_sweeper->add(point, id, side, batch),
142142
_bboxes[t]);
143143
} else if (wktType == util::geo::WKTType::MULTIPOINT) {
144-
const auto &mp = multiPointFromWKT<int32_t>(c, 0, &projFunc);
144+
const auto &mp = multiPointFromWKTProj<int32_t>(c, 0, &projFunc);
145145
if (mp.size() != 0)
146146
_bboxes[t] = util::geo::extendBox(_sweeper->add(mp, id, side, batch),
147147
_bboxes[t]);
148148
} else if (wktType == util::geo::WKTType::LINESTRING) {
149-
const auto &line = lineFromWKT<int32_t>(c, 0, &projFunc);
149+
const auto &line = lineFromWKTProj<int32_t>(c, 0, &projFunc);
150150
if (line.size() > 1)
151151
_bboxes[t] = util::geo::extendBox(
152152
_sweeper->add(line, id, side, batch), _bboxes[t]);
153153
} else if (wktType == util::geo::WKTType::MULTILINESTRING) {
154-
const auto &ml = multiLineFromWKT<int32_t>(c, 0, &projFunc);
154+
const auto &ml = multiLineFromWKTProj<int32_t>(c, 0, &projFunc);
155155
_bboxes[t] = util::geo::extendBox(_sweeper->add(ml, id, side, batch),
156156
_bboxes[t]);
157157
} else if (wktType == util::geo::WKTType::POLYGON) {
158-
const auto &poly = polygonFromWKT<int32_t>(c, 0, &projFunc);
158+
const auto &poly = polygonFromWKTProj<int32_t>(c, 0, &projFunc);
159159
if (poly.getOuter().size() > 1)
160160
_bboxes[t] = util::geo::extendBox(
161161
_sweeper->add(poly, id, side, batch), _bboxes[t]);
162162
} else if (wktType == util::geo::WKTType::MULTIPOLYGON) {
163-
const auto &mp = multiPolygonFromWKT<int32_t>(c, 0, &projFunc);
163+
const auto &mp = multiPolygonFromWKTProj<int32_t>(c, 0, &projFunc);
164164
if (mp.size())
165165
_bboxes[t] = util::geo::extendBox(_sweeper->add(mp, id, side, batch),
166166
_bboxes[t]);
167167
} else if (wktType == util::geo::WKTType::COLLECTION) {
168-
const auto &col = collectionFromWKT<int32_t>(c, 0, &projFunc);
168+
const auto &col = collectionFromWKTProj<int32_t>(c, 0, &projFunc);
169169

170170
size_t numGeoms = 0;
171171
for (const auto &a : col) {
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
add_executable(spatialjoinTest TestMain.cpp)
2-
target_link_libraries(spatialjoinTest spatialjoin-dev pb_util ${BZIP2_LIBRARIES} -lpthread)
2+
target_link_libraries(spatialjoinTest spatialjoin-dev pb_util pb_util_geo ${BZIP2_LIBRARIES} -lpthread)

0 commit comments

Comments
 (0)