6 changes: 2 additions & 4 deletions go.mod
@@ -5,7 +5,6 @@ go 1.23.7
toolchain go1.24.2

require (
github.com/ClickHouse/ch-go v0.65.1
github.com/HdrHistogram/hdrhistogram-go v1.1.2
github.com/Shopify/sarama v1.37.2
github.com/apache/thrift v0.21.0
@@ -106,6 +105,8 @@ require (
gopkg.in/yaml.v3 v3.0.1
)

require golang.org/x/tools v0.30.0 // indirect

require (
github.com/IBM/sarama v1.45.1 // indirect
github.com/alecthomas/participle/v2 v2.1.4 // indirect
@@ -142,8 +143,6 @@
github.com/elastic/lunes v0.1.0 // indirect
github.com/expr-lang/expr v1.17.2 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-faster/city v1.0.1 // indirect
github.com/go-faster/errors v0.7.1 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
@@ -219,7 +218,6 @@
github.com/rs/cors v1.11.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sagikazarmark/locafero v0.7.0 // indirect
github.com/segmentio/asm v1.2.0 // indirect
github.com/shirou/gopsutil/v4 v4.25.3 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spf13/afero v1.12.0 // indirect
8 changes: 0 additions & 8 deletions go.sum
@@ -20,8 +20,6 @@ github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2 h1:XHOnouVk1mx
github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/ClickHouse/ch-go v0.65.1 h1:SLuxmLl5Mjj44/XbINsK2HFvzqup0s6rwKLFH347ZhU=
github.com/ClickHouse/ch-go v0.65.1/go.mod h1:bsodgURwmrkvkBe5jw1qnGDgyITsYErfONKAHn05nv4=
github.com/Code-Hex/go-generics-cache v1.5.1 h1:6vhZGc5M7Y/YD8cIUcY8kcuQLB4cHR7U+0KMqAA0KcU=
github.com/Code-Hex/go-generics-cache v1.5.1/go.mod h1:qxcC9kRVrct9rHeiYpFWSoW1vxyillCVzX13KZG8dl4=
github.com/HdrHistogram/hdrhistogram-go v1.1.2 h1:5IcZpTvzydCQeHzK4Ef/D5rrSqwxob0t8PQPMybUNFM=
@@ -181,10 +179,6 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
github.com/go-faster/city v1.0.1 h1:4WAxSZ3V2Ws4QRDrscLEDcibJY8uf41H6AhXDrNDcGw=
github.com/go-faster/city v1.0.1/go.mod h1:jKcUJId49qdW3L1qKHH/3wPeUstCVpVSXTM6vO3VcTw=
github.com/go-faster/errors v0.7.1 h1:MkJTnDoEdi9pDabt1dpWf7AA8/BaSYZqibYyhZ20AYg=
github.com/go-faster/errors v0.7.1/go.mod h1:5ySTjWFiphBs07IKuiL69nxdfd5+fzh1u7FPGZP2quo=
github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU=
github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0=
@@ -576,8 +570,6 @@ github.com/sagikazarmark/locafero v0.7.0 h1:5MqpDsTGNDhY8sGp0Aowyf0qKsPrhewaLSsF
github.com/sagikazarmark/locafero v0.7.0/go.mod h1:2za3Cg5rMaTMoG/2Ulr9AwtFaIppKXTRYnozin4aB5k=
github.com/scaleway/scaleway-sdk-go v1.0.0-beta.30 h1:yoKAVkEVwAqbGbR8n87rHQ1dulL25rKloGadb3vm770=
github.com/scaleway/scaleway-sdk-go v1.0.0-beta.30/go.mod h1:sH0u6fq6x4R5M7WxkoQFY/o7UaiItec0o1LinLCJNq8=
github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys=
github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs=
github.com/shirou/gopsutil/v4 v4.25.3 h1:SeA68lsu8gLggyMbmCn8cmp97V1TI9ld9sVzAUcKcKE=
github.com/shirou/gopsutil/v4 v4.25.3/go.mod h1:xbuxyoZj+UsgnZrENu3lQivsngRR5BdjbJwf2fv4szA=
github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c h1:aqg5Vm5dwtvL+YgDpBcK1ITf3o96N/K7/wsRXQnUTEs=
89 changes: 65 additions & 24 deletions internal/storage/v2/clickhouse/README.md
# Differences from the implementation in [OTel collector contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/clickhouseexporter)

## Trace Storage Format

The most significant difference lies in the handling of **Attributes**.
In the OTel-contrib implementation, everything within the Attributes is converted to
[strings](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/80b3df26b7028a4bbe1eb606a6142cd4df9c3c74/exporter/clickhouseexporter/internal/metrics_model.go#L171-L177):

```golang
func AttributesToMap(attributes pcommon.Map) column.IterableOrderedMap {
	return orderedmap.CollectN(func(yield func(string, string) bool) {
		for k, v := range attributes.All() {
			yield(k, v.AsString())
		}
	}, attributes.Len())
}
```

The main problem with this approach is that the original data types are lost, so attribute values cannot be used directly as query parameters (although Clickhouse provides casting functions).
For example, if an attribute has an `int64` value, we might want to perform the following operation:

```sql
SELECT * FROM test WHERE resource.attributes['container.restart.count'] > 10
```

To address the above issues, the following improvements have been implemented:

Instead of using the Clickhouse [Map](https://clickhouse.com/docs/sql-reference/data-types/map) data type to store the entire `Attributes` map, the attributes are split according to their [Value](https://github.com/open-telemetry/opentelemetry-collector/blob/main/pdata/pcommon/value.go#L17-L29) data type.

For each `Value` type, the keys and values of the OTLP Map are collected into a separate pair of arrays.
For example, suppose an OTLP Map contains two key-value pairs with string and integer value types: ("server.name", "jaeger") and ("process.pid", 1234). The map is then divided into 4 parts (see the sketch after this list):
- All keys of string-valued pairs go into the array `strKeys`: `["server.name"]`
- All values of string-valued pairs go into the array `strValues`: `["jaeger"]`
- All keys of integer-valued pairs go into the array `intKeys`: `["process.pid"]`
- All values of integer-valued pairs go into the array `intValues`: `[1234]`
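
A minimal, hypothetical sketch of this grouping on a `pcommon.Map` (the function name `splitByType` and the two-type coverage are illustrative only; the real writer handles all basic value types and feeds the arrays into the Nested columns of the schema):

```golang
import "go.opentelemetry.io/collector/pdata/pcommon"

// splitByType groups a pcommon.Map into per-type key/value arrays,
// mirroring the strKeys/strValues/intKeys/intValues example above.
func splitByType(attrs pcommon.Map) (strKeys, strValues []string, intKeys []string, intValues []int64) {
	for k, v := range attrs.All() {
		switch v.Type() {
		case pcommon.ValueTypeStr:
			strKeys = append(strKeys, k)
			strValues = append(strValues, v.Str())
		case pcommon.ValueTypeInt:
			intKeys = append(intKeys, k)
			intValues = append(intValues, v.Int())
		}
	}
	return strKeys, strValues, intKeys, intValues
}
```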

⚠️ **The current solution does not cover the complex types** [Slice](https://github.com/open-telemetry/opentelemetry-collector/blob/7adca809a60fad65a71063f6651f231466a29844/pdata/pcommon/slice.go#L15-L22) and [Map](https://github.com/open-telemetry/opentelemetry-collector/blob/7adca809a60fad65a71063f6651f231466a29844/pdata/pcommon/map.go#L15-L19), because correct serialization/deserialization methods are not yet available.
We have submitted a corresponding [proposal](https://github.com/open-telemetry/opentelemetry-collector/issues/12826) to the OTel community; once it is accepted, support for these types can be added quickly. ⚠️

After grouping, the attributes are stored as follows:
- Basic types (bool, double, int, string, bytes) are stored directly as typed arrays: `Array(Bool)`, `Array(Int64)`, `Array(Float64)`, `Array(String)`, `Array(Array(UInt8))`
- Complex types (slice and map) are serialized into JSON strings before storage: `Array(String)`

### How to handle complex types (Map, Slice)

The `Value` type here refers to the `pdata` data types used in the `otel-collector` pipeline. In our architecture,
the `value_warpper` is responsible for wrapping the Protobuf-generated Go structures (the concrete implementation of `pdata`) into the `Value` type.
Although `pdata` itself is based on the OTLP specification, encapsulating it into `Value` via the `value_warpper` creates a higher-level abstraction,
which makes storing a `Value` directly in ClickHouse difficult. Specifically, when deserializing `Slice` and `Map` data contained within a `Value`,
plain JSON cannot distinguish whether a `Number` is an integer (`int`) or a floating-point number (`double`), so type information is lost.
Furthermore, directly handling the potentially deeply nested `pdata` structures within a `Value` is complex.
Therefore, to preserve data types accurately in ClickHouse and to handle this nested telemetry data,
the `pdata` data inside a `Value` is converted into the standard `OTLP/JSON` format for storage.
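
The following runnable sketch illustrates the problem (the attribute names are hypothetical): plain JSON decodes both numbers as `float64`, whereas an OTLP/JSON-style tagged encoding keeps the integer/double distinction explicit.

```golang
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Plain JSON: both numbers come back as float64; the int/double
	// distinction is gone.
	plain := []byte(`{"process.pid": 1234, "cpu.usage": 0.75}`)
	var m map[string]any
	_ = json.Unmarshal(plain, &m)
	fmt.Printf("%T %T\n", m["process.pid"], m["cpu.usage"]) // float64 float64

	// OTLP/JSON-style encoding tags every value with its type, so the
	// original type can be restored on read.
	tagged := []byte(`{"process.pid": {"intValue": "1234"}, "cpu.usage": {"doubleValue": 0.75}}`)
	var t map[string]map[string]any
	_ = json.Unmarshal(tagged, &t)
	fmt.Println(t["process.pid"], t["cpu.usage"]) // map[intValue:1234] map[doubleValue:0.75]
}
```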

### Data Read and Write Methods

The OTel-contrib implementation uses `database/sql` for writing data. The generic `database/sql` interface adds no value here; using the native client provided by ClickHouse is a better choice.

Both write and read operations use `clickhouse-go`: traces are written in batch mode and read back with regular queries (see the sketch below).

The main reason for not using `ch-go` to write traces is that `ch-go` does not support writing to multiple servers, which could become a performance bottleneck.
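
A minimal sketch of batch writing with `clickhouse-go` v2, assuming a local server and only a handful of the columns from the schema below; the actual writer appends every column, including the Nested attribute arrays, and takes the address from configuration.

```golang
package main

import (
	"context"

	"github.com/ClickHouse/clickhouse-go/v2"
)

func main() {
	ctx := context.Background()

	// Assumed local server address for illustration.
	conn, err := clickhouse.Open(&clickhouse.Options{Addr: []string{"127.0.0.1:9000"}})
	if err != nil {
		panic(err)
	}

	// Batch mode: prepare once, append many rows, send a single insert.
	// Only a subset of columns is shown; the rest take their defaults.
	batch, err := conn.PrepareBatch(ctx, "INSERT INTO otel_traces (TraceID, SpanID, ServiceName, SpanName, Duration)")
	if err != nil {
		panic(err)
	}
	if err := batch.Append(
		make([]uint8, 16), // placeholder 16-byte TraceID
		make([]uint8, 8),  // placeholder 8-byte SpanID
		"frontend",
		"GET /dispatch",
		int64(1_500_000), // Duration in nanoseconds
	); err != nil {
		panic(err)
	}
	if err := batch.Send(); err != nil {
		panic(err)
	}
}
```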

## Mapping model to DB storage
The table structure is defined in [internal/storage/v2/clickhouse/schema/schema.tmpl](./schema/schema.tmpl).

### Attributes
For `Attributes`, the key-value pairs are split into separate key and value columns based on their data type; an example query against these columns is shown below.
For example, `SpanBoolAttrs.key` stores the keys of boolean-typed span attributes, and `SpanBoolAttrs.value` stores their corresponding values.
`Events` and `Links` have a one-to-many relationship with a span, so they are modeled as `Nested` structures, and their attributes are likewise stored as `Nested` columns.
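
A hypothetical read-side sketch (the function name and the `process.pid` attribute are illustrative, not taken from the actual reader code): `indexOf` finds the 1-based position of a key in the key array, and that position is used to read the matching value from the value array; a missing key yields position 0, whose array access returns the column default and therefore never matches.

```golang
import (
	"context"

	"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
)

// findByPID selects spans whose integer attribute "process.pid" equals the
// given value, using the split key/value arrays of the SpanIntAttrs columns.
func findByPID(ctx context.Context, conn driver.Conn, pid int64) (driver.Rows, error) {
	return conn.Query(ctx, `
		SELECT TraceID, SpanName, Duration
		FROM otel_traces
		WHERE SpanIntAttrs.value[indexOf(SpanIntAttrs.key, 'process.pid')] = ?`, pid)
}
```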

### TraceID, SpanID
`TraceID` is a fixed-length `[16]byte` and `SpanID` is a fixed-length `[8]byte`.
They are handled as `byte` slices in Go and stored as `Array(UInt8)` columns in the database (see the sketch below).
Note that the two sides represent bytes differently: Clickhouse's byte-like integer aliases map to [Int8](https://clickhouse.com/docs/sql-reference/data-types/int-uint#integer-aliases) (signed), whereas Go's `byte` is `uint8` (unsigned), which is why the column type is spelled out as `Array(UInt8)`.
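
A minimal sketch (hypothetical helpers, assuming the pdata ID types) of turning the fixed-size IDs into the byte slices appended to the `Array(UInt8)` columns:

```golang
import "go.opentelemetry.io/collector/pdata/pcommon"

// traceIDBytes and spanIDBytes are hypothetical helpers: the pdata ID types
// are fixed-size byte arrays, and slicing them yields the []byte values that
// clickhouse-go appends to the Array(UInt8) columns.
func traceIDBytes(id pcommon.TraceID) []byte { return id[:] }

func spanIDBytes(id pcommon.SpanID) []byte { return id[:] }
```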

### StartTime, Duration
A span's time range is defined by two fields, StartTime and EndTime, but our primary interest is when a span started and how long it lasted, rather than when it ended.
In addition, Duration is frequently used as a query condition, such as finding all traces whose maximum span duration is at most 10 milliseconds.
Consequently, the duration is stored as an integer number of nanoseconds (`Int64`), which can easily be converted to other time units (days, hours, minutes, seconds, etc.). A sketch of deriving it from a span's timestamps follows.
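
A minimal sketch (hypothetical helper, assuming the pdata span type) of computing the value stored in the `Duration` column:

```golang
import "go.opentelemetry.io/collector/pdata/ptrace"

// durationNanos derives the Duration (Int64) column value: the span's end
// timestamp minus its start timestamp, in nanoseconds.
func durationNanos(span ptrace.Span) int64 {
	return span.EndTimestamp().AsTime().Sub(span.StartTimestamp().AsTime()).Nanoseconds()
}
```
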
134 changes: 134 additions & 0 deletions internal/storage/v2/clickhouse/schema/schema.tmpl
CREATE TABLE IF NOT EXISTS otel_traces
(
StartTime DateTime64(9) CODEC(Delta, ZSTD(1)),
TraceID Array(UInt8) CODEC(ZSTD(1)),
SpanID Array(UInt8) CODEC(ZSTD(1)),
ParentSpanID Array(UInt8) CODEC(ZSTD(1)),
TraceState String CODEC(ZSTD(1)),
ServiceName String CODEC(ZSTD(1)),
SpanName String CODEC(ZSTD(1)),
SpanKind String CODEC(ZSTD(1)),
Duration Int64,
StatusCode String CODEC(ZSTD(1)),
StatusMessage String CODEC(ZSTD(1)),

SpanBoolAttrs Nested (
key String,
value Bool
),
SpanDoubleAttrs Nested (
key String,
value Float64
),
SpanIntAttrs Nested (
key String,
value Int64
),
SpanStrAttrs Nested (
key String,
value String
),
SpanBytesAttrs Nested (
key String,
value Array(UInt8)
),

ScopeName String CODEC(ZSTD(1)),
ScopeVersion String CODEC(ZSTD(1)),
ScopeBoolAttrs Nested (
key String,
value Bool
),
ScopeDoubleAttrs Nested (
key String,
value Float64
),
ScopeIntAttrs Nested (
key String,
value Int64
),
ScopeStrAttrs Nested (
key String,
value String
),
ScopeBytesAttrs Nested (
key String,
value Array(UInt8)
),

ResourceBoolAttrs Nested (
key String,
value Bool
),
ResourceDoubleAttrs Nested (
key String,
value Float64
),
ResourceIntAttrs Nested (
key String,
value Int64
),
ResourceStrAttrs Nested (
key String,
value String
),
ResourceBytesAttrs Nested (
key String,
value Array(UInt8)
),

Events Nested(
Name String,
Timestamp DateTime64(9),
BoolAttrs Nested (
key String,
value Bool
),
DoubleAttrs Nested (
key String,
value Float64
),
IntAttrs Nested (
key String,
value Int64
),
StrAttrs Nested (
key String,
value String
),
BytesAttrs Nested (
key String,
value Array(UInt8)
)
),

Links Nested(
TraceID Array(UInt8),
SpanID Array(UInt8),
TraceState String,
BoolAttrs Nested (
key String,
value Bool
),
DoubleAttrs Nested (
key String,
value Float64
),
IntAttrs Nested (
key String,
value Int64
),
StrAttrs Nested (
key String,
value String
),
BytesAttrs Nested (
key String,
value Array(UInt8)
)
    )
)
ENGINE = MergeTree()
PARTITION BY toDate(StartTime)
ORDER BY (SpanName, toUnixTimestamp(StartTime), TraceID)
SETTINGS index_granularity=8192, ttl_only_drop_parts = 1;