diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 42579627b..72b39387b 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -25,6 +25,7 @@ #ifndef KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_ #define KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_ +#include #include #include @@ -61,8 +62,8 @@ inline CompressionStrategy* compression_strategy(const std::string& name = "") { typedef std::string (*decompress_func_t)(const std::string&); typedef void (CompressionStrategy::*compress_mem_fn_t)(buffer_t*, const char*, size_t); -inline decompress_func_t decompressor_by_code(const std::string& s) { - switch (s[0]) { +inline decompress_func_t decompressor_by_code(const CompressionAlgorithm algorithm) { + switch (algorithm) { case NO_COMPRESSION: TRACE("unpack uncompressed string"); return RawCompressionStrategy::DoDecompress; @@ -73,8 +74,27 @@ inline decompress_func_t decompressor_by_code(const std::string& s) { TRACE("unpack snappy compressed string"); return SnappyCompressionStrategy::DoDecompress; default: - throw std::invalid_argument("Invalid compression code " + - boost::lexical_cast(static_cast(s[0]))); + throw std::invalid_argument("Invalid compression algorithm " + + boost::lexical_cast(static_cast(algorithm))); + } +} + +inline decompress_func_t decompressor_from_string(const std::string& s) { + return decompressor_by_code(static_cast(s[0])); +} + +/** Returns an instance of a compression strategy by enum. */ +inline compression_strategy_t compression_strategy_by_code(const CompressionAlgorithm algorithm) { + switch (algorithm) { + case NO_COMPRESSION: + return std::make_unique(); + case ZLIB_COMPRESSION: + return std::make_unique(); + case SNAPPY_COMPRESSION: + return std::make_unique(); + default: + throw std::invalid_argument("Invalid compression algorithm " + + boost::lexical_cast(static_cast(algorithm))); } } diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 533cf5107..cbde5485f 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -26,13 +26,14 @@ #define KEYVI_COMPRESSION_COMPRESSION_STRATEGY_H_ #include +#include #include #include namespace keyvi { namespace compression { -enum CompressionCode { +enum CompressionAlgorithm { NO_COMPRESSION = 0, ZLIB_COMPRESSION = 1, SNAPPY_COMPRESSION = 2, @@ -61,6 +62,12 @@ struct CompressionStrategy { return std::string(buf.data(), buf.size()); } + inline std::string CompressWithoutHeader(const std::string& raw) { + buffer_t buf; + Compress(&buf, raw.data(), raw.size()); + return std::string(buf.data() + 1, buf.size() - 1); + } + /** * By the time this function is called, the length field added in Compress() * will have been removed. @@ -71,6 +78,8 @@ struct CompressionStrategy { virtual std::string name() const = 0; }; +using compression_strategy_t = std::unique_ptr; + /** * A compression strategy that does almost nothing; i.e. it only adds * the length field. @@ -84,12 +93,6 @@ struct RawCompressionStrategy final : public CompressionStrategy { std::memcpy(buffer->data() + 1, raw, raw_size); } - static inline std::string DoCompress(const char* raw, size_t raw_size) { - buffer_t buf; - DoCompress(&buf, raw, raw_size); - return std::string(buf.data(), buf.size()); - } - inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); } static inline std::string DoDecompress(const std::string& compressed) { return compressed.substr(1); } diff --git a/keyvi/include/keyvi/compression/snappy_compression_strategy.h b/keyvi/include/keyvi/compression/snappy_compression_strategy.h index cf7868251..aa000c042 100644 --- a/keyvi/include/keyvi/compression/snappy_compression_strategy.h +++ b/keyvi/include/keyvi/compression/snappy_compression_strategy.h @@ -46,12 +46,6 @@ struct SnappyCompressionStrategy final : public CompressionStrategy { buffer->resize(output_length + 1); } - static inline std::string DoCompress(const char* raw, size_t raw_size) { - buffer_t buf; - DoCompress(&buf, raw, raw_size); - return std::string(buf.data(), buf.size()); - } - inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); } static std::string DoDecompress(const std::string& compressed) { diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index b25bc2b9c..33bdb99ff 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -394,6 +394,13 @@ class Automata final { return value_store_reader_->GetRawValueAsString(state_value); } + std::string GetMsgPackedValueAsString(uint64_t state_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + assert(value_store_reader_); + return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm); + } + std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h index cee376dc9..e6e43be58 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h @@ -33,6 +33,7 @@ #include #include +#include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/dictionary_merger_fwd.h" #include "keyvi/dictionary/fsa/internal/value_store_properties.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" @@ -109,15 +110,39 @@ class IValueStoreReader { * Get Value as string in raw format * * Note: The raw format is an implementation detail of keyvi, not an official binary interface. - * Value store implementers can override this method for performance reasons. + * Value store implementers can override this method with an optimized version. * * @param fsa_value - * @return the value as string without any decompression + * @return the value as binary encoded string */ virtual std::string GetRawValueAsString(uint64_t fsa_value) const { return keyvi::util::EncodeJsonValue(GetValueAsString(fsa_value)); } + /** + * Get Value as msgpack string + * + * Value store implementers can override this method with an optimized version. + * + * @param fsa_value + * @return the value as msgpack string + */ + virtual std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + const std::string msgpacked_value = keyvi::util::JsonStringToMsgPack(GetValueAsString(fsa_value)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + + // compress the value + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + /** * Get Value as string (for dumping or communication) * diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index 9d3e5d41b..cf7d6069a 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -353,6 +353,35 @@ class JsonValueStoreReader final : public IValueStoreReader { return keyvi::util::decodeVarIntString(strings_ + fsa_value); } + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + size_t value_size; + const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); + + if (value_size == 0) { + return std::string(); + } + + if (value_ptr[0] == compression_algorithm) { + return std::string(value_ptr + 1, value_size - 1); + } + + // decompress + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(value_ptr[0])); + std::string msgpacked_value = decompressor(std::string(value_ptr, value_size)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + std::string GetValueAsString(uint64_t fsa_value) const override { TRACE("JsonValueStoreReader GetValueAsString"); std::string packed_string = keyvi::util::decodeVarIntString(strings_ + fsa_value); diff --git a/keyvi/include/keyvi/dictionary/match.h b/keyvi/include/keyvi/dictionary/match.h index db8b2a684..ee80d990d 100644 --- a/keyvi/include/keyvi/dictionary/match.h +++ b/keyvi/include/keyvi/dictionary/match.h @@ -32,6 +32,7 @@ #include #include +#include "keyvi/compression/compression_strategy.h" #include "keyvi/dictionary/fsa/automata.h" #include "keyvi/util/json_value.h" @@ -196,13 +197,33 @@ struct Match { return fsa_->GetRawValueAsString(state_); } - std::string GetMsgPackedValueAsString() const { - const std::string raw_value = GetRawValueAsString(); - if (raw_value.empty()) { - return raw_value; + std::string GetMsgPackedValueAsString(const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + if (!fsa_) { + if (raw_value_.empty()) { + return raw_value_; + } + + if (raw_value_[0] == compression_algorithm) { + return raw_value_.substr(1); + } + + // decompress + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(raw_value_[0])); + std::string msgpacked_value = decompressor(raw_value_); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); } - const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value); - return decompressor(raw_value); + + return fsa_->GetMsgPackedValueAsString(state_, compression_algorithm); } /** diff --git a/keyvi/include/keyvi/util/float_vector_value.h b/keyvi/include/keyvi/util/float_vector_value.h index a36317331..8f9231a96 100644 --- a/keyvi/include/keyvi/util/float_vector_value.h +++ b/keyvi/include/keyvi/util/float_vector_value.h @@ -35,7 +35,7 @@ namespace keyvi { namespace util { inline std::vector DecodeFloatVector(const std::string& encoded_value) { - compression::decompress_func_t decompressor = compression::decompressor_by_code(encoded_value); + compression::decompress_func_t decompressor = compression::decompressor_from_string(encoded_value); std::string unompressed_string_value = decompressor(encoded_value); const size_t vector_size = unompressed_string_value.size() / sizeof(uint32_t); diff --git a/keyvi/include/keyvi/util/json_value.h b/keyvi/include/keyvi/util/json_value.h index 2fb35f3cf..6463dc011 100644 --- a/keyvi/include/keyvi/util/json_value.h +++ b/keyvi/include/keyvi/util/json_value.h @@ -42,7 +42,7 @@ namespace util { /** Decompresses (if needed) and decodes a json value stored in a JsonValueStore. */ inline std::string DecodeJsonValue(const std::string& encoded_value) { - compression::decompress_func_t decompressor = compression::decompressor_by_code(encoded_value); + compression::decompress_func_t decompressor = compression::decompressor_from_string(encoded_value); std::string packed_string = decompressor(encoded_value); TRACE("unpacking %s", packed_string.c_str()); @@ -64,17 +64,7 @@ inline void EncodeJsonValue(std::functionclear(); - rapidjson::Document json_document; - json_document.Parse(raw_value.c_str()); - - if (!json_document.HasParseError()) { - TRACE("Got json"); - msgpack::packer packer(msgpack_buffer); - JsonToMsgPack(json_document, &packer, single_precision_float); - } else { - TRACE("Got a normal string"); - msgpack::pack(msgpack_buffer, raw_value); - } + JsonStringToMsgPack(raw_value, msgpack_buffer, single_precision_float); // compression if (msgpack_buffer->size() > compression_threshold) { long_compress(buffer, msgpack_buffer->data(), msgpack_buffer->size()); diff --git a/keyvi/include/keyvi/util/msgpack_util.h b/keyvi/include/keyvi/util/msgpack_util.h index 8ce3a6ce4..a353f915b 100644 --- a/keyvi/include/keyvi/util/msgpack_util.h +++ b/keyvi/include/keyvi/util/msgpack_util.h @@ -25,6 +25,7 @@ #ifndef KEYVI_UTIL_MSGPACK_UTIL_H_ #define KEYVI_UTIL_MSGPACK_UTIL_H_ #include +#include #include "msgpack.hpp" #include "rapidjson/document.h" @@ -147,6 +148,28 @@ inline void MsgPackDump(Writer* writer, const msgpack::object& o) { } } +inline void JsonStringToMsgPack(const std::string& raw_value, msgpack::v1::sbuffer* msgpack_buffer, + bool single_precision_float) { + rapidjson::Document json_document; + json_document.Parse(raw_value.c_str()); + + if (!json_document.HasParseError()) { + TRACE("Got json"); + msgpack::packer packer(msgpack_buffer); + JsonToMsgPack(json_document, &packer, single_precision_float); + } else { + TRACE("Got a normal string"); + msgpack::pack(msgpack_buffer, raw_value); + } +} + +inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single_precision_float = false) { + msgpack::sbuffer msgpack_buffer; + + JsonStringToMsgPack(raw_value, &msgpack_buffer, single_precision_float); + return std::string(reinterpret_cast(msgpack_buffer.data()), msgpack_buffer.size()); +} + } /* namespace util */ } /* namespace keyvi */ diff --git a/python/src/addons/Match.pyx b/python/src/addons/Match.pyx index 372714270..3c7163ecf 100644 --- a/python/src/addons/Match.pyx +++ b/python/src/addons/Match.pyx @@ -169,7 +169,11 @@ def GetRawValueAsString(self, *args): """deprecated, use get_raw_value_as_string""" - return call_deprecated_method("GetRawValueAsString", "raw_value_as_string", self.raw_value_as_string, *args) + return call_deprecated_method("GetRawValueAsString", "dumps", self.dumps, *args) + + def raw_value_as_string(self, *args): + """deprecated, use get_raw_value_as_string""" + return call_deprecated_method("raw_value_as_string", "dumps", self.dumps, *args) def __bool__(self): return not self.inst.get().IsEmpty() diff --git a/python/src/pxds/compression.pxd b/python/src/pxds/compression.pxd new file mode 100644 index 000000000..5935ef726 --- /dev/null +++ b/python/src/pxds/compression.pxd @@ -0,0 +1,5 @@ +cdef extern from "keyvi/compression/compression_strategy.h" namespace "keyvi::compression": + ctypedef enum CompressionAlgorithm: + NO_COMPRESSION, + ZLIB_COMPRESSION, + SNAPPY_COMPRESSION diff --git a/python/src/pxds/match.pxd b/python/src/pxds/match.pxd index 775c5d289..62f47eb8b 100644 --- a/python/src/pxds/match.pxd +++ b/python/src/pxds/match.pxd @@ -4,6 +4,7 @@ from libcpp.string cimport string as libcpp_utf8_string from libcpp.string cimport string as libcpp_utf8_output_string from libcpp cimport bool from cpython.ref cimport PyObject +from compression cimport CompressionAlgorithm cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": cdef cppclass Match: @@ -20,7 +21,8 @@ cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": PyObject* GetAttributePy(libcpp_utf8_string) except + nogil # wrap-ignore libcpp_utf8_output_string GetValueAsString() except + # wrap-as:value_as_string libcpp_string GetRawValueAsString() except + # wrap-as:raw_value_as_string - libcpp_string GetMsgPackedValueAsString() except + # wrap-ignore + libcpp_string GetMsgPackedValueAsString() except + # wrap-as:msgpacked_value_as_string + libcpp_string GetMsgPackedValueAsString(CompressionAlgorithm) except + # wrap-as:msgpacked_value_as_string void SetRawValue(libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, float) except + # wrap-ignore diff --git a/python/src/py/keyvi/__init__.py b/python/src/py/keyvi/__init__.py index 647e52b97..352602f7e 100644 --- a/python/src/py/keyvi/__init__.py +++ b/python/src/py/keyvi/__init__.py @@ -20,4 +20,4 @@ from keyvi._version import __version__ # global keyvi concepts -from keyvi._core import MatchIterator, Match, loading_strategy_types +from keyvi._core import MatchIterator, Match, loading_strategy_types, CompressionAlgorithm diff --git a/python/tests/match_object_test.py b/python/tests/match_object_test.py index 9c907a2ac..9ed012db5 100644 --- a/python/tests/match_object_test.py +++ b/python/tests/match_object_test.py @@ -2,9 +2,10 @@ # Usage: py.test tests import keyvi +import msgpack from test_tools import tmp_dictionary import warnings - +import zlib from keyvi.compiler import ( JsonDictionaryCompiler, @@ -30,12 +31,13 @@ def test_raw_serialization(): c = JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", '{"a" : 2}') c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'match_object_json.kv') as d: + with tmp_dictionary(c, "match_object_json.kv") as d: m = d["abc"] assert m.value_as_string() == '{"a":2}' d = m.dumps() m2 = keyvi.Match.loads(d) assert m2.value_as_string() == '{"a":2}' + assert msgpack.loads(m.msgpacked_value_as_string()) == {"a": 2} with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") assert m.GetValueAsString() == '{"a":2}' @@ -63,8 +65,8 @@ def test_unicode_attributes(): def test_bytes_attributes(): m = keyvi.Match() - bytes_key = bytes(u"äöü".encode('utf-8')) - bytes_value = bytes(u"äöüöäü".encode('utf-8')) + bytes_key = bytes("äöü".encode("utf-8")) + bytes_value = bytes("äöüöäü".encode("utf-8")) m[bytes_key] = 22 assert m[bytes_key] == 22 m["k2"] = bytes_value @@ -73,14 +75,14 @@ def test_bytes_attributes(): def test_double_attributes(): m = keyvi.Match() - bytes_key = bytes("abc".encode('utf-8')) + bytes_key = bytes("abc".encode("utf-8")) m[bytes_key] = 42.0 assert m[bytes_key] == 42.0 def test_boolean_attributes(): m = keyvi.Match() - bytes_key = bytes("def".encode('utf-8')) + bytes_key = bytes("def".encode("utf-8")) m[bytes_key] = True assert m[bytes_key] == True @@ -125,44 +127,83 @@ def test_get_value(): c = JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", '{"a" : 2}') c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'match_object_json.kv') as d: + with tmp_dictionary(c, "match_object_json.kv") as d: m = d["abc"] assert m.value == {"a": 2} m = d["abd"] assert m.value == {"a": 3} + assert msgpack.loads(m.msgpacked_value_as_string()) == {"a": 3} + assert msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.ZLIB_COMPRESSION) + ) + ) == {"a": 3} def test_get_value_int(): c = CompletionDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", 42) c.add("abd", 21) - with tmp_dictionary(c, 'match_object_int.kv') as d: + with tmp_dictionary(c, "match_object_int.kv") as d: m = d["abc"] assert m.value == 42 m = d["abd"] assert m.value == 21 + assert msgpack.loads(m.msgpacked_value_as_string()) == 21 + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == 21 + ) def test_get_value_key_only(): c = KeyOnlyDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc") c.add("abd") - with tmp_dictionary(c, 'match_object_key_only.kv') as d: + with tmp_dictionary(c, "match_object_key_only.kv") as d: m = d["abc"] - assert m.value == '' + assert m.value == "" m = d["abd"] - assert m.value == '' + assert m.value == "" + assert msgpack.loads(m.msgpacked_value_as_string()) == "" + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == "" + ) def test_get_value_string(): c = StringDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", "aaaaa") c.add("abd", "bbbbb") - with tmp_dictionary(c, 'match_object_string.kv') as d: + with tmp_dictionary(c, "match_object_string.kv") as d: m = d["abc"] assert m.value == "aaaaa" m = d["abd"] assert m.value == "bbbbb" + assert msgpack.loads(m.msgpacked_value_as_string()) == "bbbbb" + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == "bbbbb" + ) def test_matched_string():