-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for xxHash's dispatch. Clean up compile warnings. Improve usage message. Add submodules for xxHash and uthash.
- Loading branch information
Showing 11 changed files with 185 additions and 4,501 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,4 @@ | ||
swuniq | ||
*.o | ||
sonar-project.properties | ||
bin/* | ||
bin | ||
benchsuite/corpustest | ||
out/* | ||
out |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[submodule "uthash"] | ||
path = uthash | ||
url = https://github.com/troydhanson/uthash | ||
[submodule "xxHash"] | ||
path = xxHash | ||
url = https://github.com/Cyan4973/xxHash |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,138 +1,197 @@ | ||
/* | ||
* swuniq - sliding window uniq | ||
* | ||
* MIT License | ||
* | ||
* Copyright (c) 2018 Miguel Terron | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in all | ||
* copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
* SOFTWARE. | ||
* | ||
*/ | ||
* swuniq - sliding window uniq | ||
* | ||
* MIT License | ||
* | ||
* Copyright (c) 2018 Miguel Terron | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a | ||
* copy of this software and associated documentation files (the "Software"), | ||
* to deal in the Software without restriction, including without limitation | ||
* the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
* and/or sell copies of the Software, and to permit persons to whom the | ||
* Software is furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
* IN THE SOFTWARE. | ||
* | ||
*/ | ||
|
||
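/* swuniq : | ||
* Filters repeated lines out of INPUT (or stdin) and writes the rest to stdout. | ||
* A line is dropped when its hash is still held in the sliding window of the | ||
* most recently printed lines (default window: 10 lines, set with -w N). | ||
*/ | ||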
|
||
|
||
/* ************************************ | ||
* Includes | ||
**************************************/ | ||
#include <stdlib.h> | ||
#include <ctype.h> | ||
#include <err.h> | ||
#include <inttypes.h> | ||
#include <stdbool.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
#include <err.h> | ||
|
||
#include <unistd.h> | ||
#include <inttypes.h> | ||
#include <ctype.h> | ||
|
||
#define XXH_PRIVATE_API | ||
#define XXH_STATIC_LINKING_ONLY | ||
#define XXH_INLINE_ALL | ||
#include "xxhash.h" | ||
|
||
#include "utringbuffer.h" | ||
#include "xxHash/xxhash.h" | ||
|
||
#ifdef XXHSUM_DISPATCH | ||
# include "xxHash/xxh_x86dispatch.h" | ||
#endif | ||
|
||
#include "uthash/src/utringbuffer.h" | ||
|
||
/* makes the next part easier */ | ||
#if defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) | ||
# define ARCH_X64 1 | ||
# define ARCH_X86 "x86_64" | ||
#elif defined(__i386__) || defined(_M_IX86) || defined(_M_IX86_FP) | ||
# define ARCH_X86 "i386" | ||
#endif | ||
/* Try to detect the architecture. */ | ||
#if defined(ARCH_X86) | ||
# if defined(XXHSUM_DISPATCH) | ||
# define ARCH ARCH_X86 " autoVec" | ||
# elif defined(__AVX512F__) | ||
# define ARCH ARCH_X86 " + AVX512" | ||
# elif defined(__AVX2__) | ||
# define ARCH ARCH_X86 " + AVX2" | ||
# elif defined(__AVX__) | ||
# define ARCH ARCH_X86 " + AVX" | ||
# elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) \ | ||
|| defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) | ||
# define ARCH ARCH_X86 " + SSE2" | ||
# else | ||
# define ARCH ARCH_X86 | ||
# endif | ||
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) | ||
# define ARCH "aarch64 + NEON" | ||
#elif defined(__arm__) || defined(__thumb__) || defined(__thumb2__) || defined(_M_ARM) | ||
/* ARM has a lot of different features that can change xxHash significantly. */ | ||
# if defined(__thumb2__) || (defined(__thumb__) && (__thumb__ == 2 || __ARM_ARCH >= 7)) | ||
# define ARCH_THUMB " Thumb-2" | ||
# elif defined(__thumb__) | ||
# define ARCH_THUMB " Thumb-1" | ||
# else | ||
# define ARCH_THUMB "" | ||
# endif | ||
/* ARMv7 has unaligned by default */ | ||
# if defined(__ARM_FEATURE_UNALIGNED) || __ARM_ARCH >= 7 || defined(_M_ARMV7VE) | ||
# define ARCH_UNALIGNED " + unaligned" | ||
# else | ||
# define ARCH_UNALIGNED "" | ||
# endif | ||
# if defined(__ARM_NEON) || defined(__ARM_NEON__) | ||
# define ARCH_NEON " + NEON" | ||
# else | ||
# define ARCH_NEON "" | ||
# endif | ||
# define ARCH "ARMv" EXPAND_AND_QUOTE(__ARM_ARCH) ARCH_THUMB ARCH_NEON ARCH_UNALIGNED | ||
#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) | ||
# if defined(__GNUC__) && defined(__POWER9_VECTOR__) | ||
# define ARCH "ppc64 + POWER9 vector" | ||
# elif defined(__GNUC__) && defined(__POWER8_VECTOR__) | ||
# define ARCH "ppc64 + POWER8 vector" | ||
# else | ||
# define ARCH "ppc64" | ||
# endif | ||
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) | ||
# define ARCH "ppc" | ||
#elif defined(__AVR) | ||
# define ARCH "AVR" | ||
#elif defined(__mips64) | ||
# define ARCH "mips64" | ||
#elif defined(__mips) | ||
# define ARCH "mips" | ||
#elif defined(__s390x__) | ||
# define ARCH "s390x" | ||
#elif defined(__s390__) | ||
# define ARCH "s390" | ||
#else | ||
# define ARCH "unknown" | ||
#endif | ||
static const int g_nbBits = (int)(sizeof(void*)*8); | ||
|
||
/********************************************************************************************************************/ | ||
|
||
unsigned long long hashString(const void* buffer, size_t length) | ||
{ | ||
unsigned long long const seed = 1029384756; | ||
unsigned long long const hash = XXH3_64bits_withSeed(buffer, length, seed); | ||
unsigned long long hashString(const char *data) { | ||
unsigned long long const hash = XXH3_64bits(data, strlen(data)); | ||
return hash; | ||
} | ||
|
||
/* returns true (non-zero) if the hash is already present in the ring buffer */ | ||
int lookup(const unsigned long long hash, const UT_ringbuffer* rbuffer) | ||
{ | ||
int out = 0; | ||
|
||
if (utringbuffer_len(rbuffer) == 0) return(out); | ||
else | ||
{ | ||
//unsigned long long *item; | ||
for (int i=0; i < utringbuffer_len(rbuffer); i++) { | ||
bool lookup(unsigned long long hash, const UT_ringbuffer *rbuffer) { | ||
bool out = false; | ||
|
||
if (utringbuffer_len(rbuffer) == 0) | ||
return (out); | ||
else { | ||
for (unsigned int i = 0; i < utringbuffer_len(rbuffer); i++) { | ||
unsigned long long *item = utringbuffer_eltptr(rbuffer, i); | ||
out = (hash == *item); | ||
if (out) break; | ||
if (out) | ||
break; | ||
} | ||
return(out); | ||
return (out); | ||
} | ||
} | ||
|
||
/********************************************************** | ||
* Main | ||
**********************************************************/ | ||
int main (int argc, char *argv[]){ | ||
int wsize = 10; // Default window size | ||
* Main | ||
**********************************************************/ | ||
int main(int argc, char *argv[]) { | ||
unsigned int wsize = 10; // Default window size | ||
int c; | ||
|
||
while ((c = getopt (argc, argv, "hw:")) != -1) | ||
{ | ||
switch (c) | ||
{ | ||
case 'w': | ||
wsize = strtoumax(optarg, NULL, 10); | ||
break; | ||
case 'h': | ||
default: | ||
fprintf(stderr,"Usage: swuniq [-w N] INPUT\nFilter matching lines (within a configurable window) from INPUT\n(or stdin), writing to stdout.\n\n\t-w N Size of the sliding window to use for deduplication\nNote: By default swuniq will use a window of 10 lines.\n\n"); | ||
exit(1); | ||
while ((c = getopt(argc, argv, "hw:")) != -1) { | ||
switch (c) { | ||
case 'w': | ||
wsize = strtoumax(optarg, NULL, 10); | ||
break; | ||
case 'h': | ||
default: | ||
#define HELP_MESSAGE "swuniq 0.6 by Miguel Terron compiled as %i-bit %s\nFilter matching lines (within a configurable window) from INPUT\n(or stdin), writing to stdout.\n\nUsage: swuniq [-w N] INPUT\n\t-w N Size of the sliding window to use for deduplication\nNote: By default swuniq will use a window of 10 lines.\n", g_nbBits, ARCH | ||
fprintf(stderr, HELP_MESSAGE); | ||
exit(1); | ||
} | ||
} | ||
|
||
// Open file if filename is provided | ||
if(optind < argc) { | ||
if ( freopen(argv[optind], "r", stdin) == NULL) | ||
{ | ||
fprintf(stderr,"Can't open file %s",argv[optind]); | ||
if (optind < argc) { | ||
if (freopen(argv[optind], "r", stdin) == NULL) { | ||
fprintf(stderr, "Can't open file %s", argv[optind]); | ||
exit(1); | ||
} | ||
} | ||
|
||
char *buffer; | ||
size_t bufsize = 6000; | ||
|
||
UT_ringbuffer *history; | ||
UT_icd ut_long_long_icd = {sizeof(long long), NULL, NULL, NULL }; | ||
UT_icd ut_long_long_icd = {sizeof(long long), NULL, NULL, NULL}; | ||
utringbuffer_new(history, wsize, &ut_long_long_icd); | ||
unsigned long long digest; | ||
|
||
buffer = (char *)malloc(bufsize * sizeof(char)); | ||
if( buffer == NULL ) | ||
{ | ||
perror("Unable to allocate buffer"); | ||
exit(1); | ||
} | ||
|
||
while( -1 != getline(&buffer, &bufsize, stdin) ) | ||
{ | ||
digest = hashString(buffer, strlen(buffer)); | ||
if (!lookup(digest,history)) | ||
{ | ||
char* line; | ||
size_t bufsize = sysconf(_SC_PAGESIZE); | ||
unsigned long long digest; | ||
while (-1 != getline(&line, &bufsize, stdin)) { | ||
digest = hashString(line); | ||
if (!lookup(digest, history)) { | ||
utringbuffer_push_back(history, &digest); | ||
printf("%s",buffer); | ||
printf("%s", line); | ||
fflush(stdout); | ||
} | ||
} | ||
|
||
fclose(stdin); | ||
// utringbuffer_free(history); | ||
free(line); | ||
// utringbuffer_free(history); | ||
exit(0); | ||
} |
Oops, something went wrong.